@geravant/sinain 1.19.0 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +439 -0
- package/sinain-core/package.json +2 -0
- package/sinain-core/src/index.ts +283 -0
- package/sinain-core/src/learning/local-curation.ts +3 -0
- package/sinain-core/src/server.ts +1570 -2
- package/sinain-core/src/web-db/schema.ts +122 -0
- package/sinain-core/src/web-db/store.ts +406 -0
- package/sinain-memory/concept_export.py +310 -0
- package/sinain-memory/concept_import.py +254 -0
- package/sinain-memory/graph_query.py +461 -4
- package/sinain-memory/knowledge_integrator.py +87 -10
- package/sinain-memory/page_renderer.py +447 -0
- package/sinain-memory/retract.py +236 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Concept Export — package an entity + its neighborhood as a portable bundle.
|
|
3
|
+
|
|
4
|
+
Produces a sinain-concept/v1 envelope that can be transferred to another
|
|
5
|
+
machine and re-imported with concept_import.py to reconstruct the same
|
|
6
|
+
entity page (including the LLM-rendered view, if bundled).
|
|
7
|
+
|
|
8
|
+
The reproducibility invariant we honor:
|
|
9
|
+
"On a new machine: import the bundle → open the same URL → see the same page."
|
|
10
|
+
|
|
11
|
+
For that to hold:
|
|
12
|
+
1. Entity IDs are content-addressed slugs → stable across machines.
|
|
13
|
+
2. Triples are exported verbatim (created_at, retracted) for round-trip.
|
|
14
|
+
3. Optionally bundle the rendered_page JSON so the receiver gets a
|
|
15
|
+
cache hit on first view (deterministic visual identity).
|
|
16
|
+
|
|
17
|
+
We do NOT bundle embeddings — same model on both ends → same vectors,
|
|
18
|
+
so the receiver recomputes them, saving ~1.5KB per fact.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
python3 concept_export.py --db <kg.db> --root entity:foo \\
|
|
22
|
+
[--depth 1] [--include-retracted] [--include-page] [--web-db <web.db>] \\
|
|
23
|
+
[--redact private,creditcard,apikey,...]
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import hashlib
|
|
29
|
+
import json
|
|
30
|
+
import re
|
|
31
|
+
import sys
|
|
32
|
+
import time
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Redaction (MIRROR OF sense_client/privacy.py — keep patterns in sync until
|
|
37
|
+
# we extract a shared sinain-memory/redaction.py module).
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Version tag of the rule table below; export_concept writes it into the
# envelope as redactions.rules_version.
REDACT_RULES_VERSION = "1.2"

# Ordered rule table consumed by apply_redactions. Each entry is
# (compiled regex, replacement text, rule name); a rule only runs when its
# name is in the caller's enabled set, and rules run in the order listed.
# Note that the default rule set used by export_concept and the CLI enables
# only a subset of these names.
_REDACT_PATTERNS: list[tuple[re.Pattern, str, str]] = [
    # (regex, replacement, rule-name)
    # 16 digits in groups of four, optionally separated by whitespace or "-".
    (re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"), "[REDACTED:card]", "creditcard"),
    # "sk-" / "pk-" prefixed tokens, or "api_key=" style assignments, with a 20+ char value.
    (re.compile(r"\b(?:sk-|pk-|api[_-]?key[=:]\s*)[A-Za-z0-9_\-]{20,}\b"), "[REDACTED:apikey]", "apikey"),
    (re.compile(r"Bearer\s+[A-Za-z0-9_\-\.]{20,}"), "[REDACTED:bearer]", "bearer"),
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED:awskey]", "awskey"),
    # Case-insensitive "password: value" / "pwd=value"; the value runs to the next whitespace.
    (re.compile(r"(?:password|passwd|pwd)\s*[:=]\s*\S+", re.IGNORECASE), "[REDACTED:password]", "password"),
    (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "[REDACTED:github_pat]", "github_pat"),
    (re.compile(r"\bghs_[A-Za-z0-9]{36}\b"), "[REDACTED:github_srv]", "github_srv"),
    (re.compile(r"\bxox[bpoa]-[0-9A-Za-z\-]+"), "[REDACTED:slack]", "slack"),
    (re.compile(r"\bya29\.[0-9A-Za-z\-_]+"), "[REDACTED:google_oauth]", "google_oauth"),
    # Three dot-separated base64url segments starting with "eyJ".
    (re.compile(r"\beyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+"), "[REDACTED:jwt]", "jwt"),
    # Case-insensitive "secret|token|key" assignment with a value of 10+ chars.
    (re.compile(r"(?:secret|token|key)\s*[:=]\s*[A-Za-z0-9_\-\.]{10,}", re.IGNORECASE), "[REDACTED:secret]", "secret"),
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED:ssn]", "ssn"),
    # NOTE(review): this pattern matches only the "-----BEGIN ... PRIVATE KEY-----"
    # header line; the key body and the END line are not part of the match, so
    # they stay in the text. Confirm whether that is intended.
    (re.compile(r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"), "[REDACTED:privkey]", "privkey"),
]
# Whole <private>...</private> span, non-greedy; DOTALL lets it cross newlines.
_PRIVATE_TAG = re.compile(r"<private>.*?</private>", re.DOTALL)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def apply_redactions(text: str, enabled_rules: set[str]) -> tuple[str, list[str]]:
    """Scrub *text* with every enabled redaction rule.

    The ``private`` tag rule is tried first, followed by the entries of
    ``_REDACT_PATTERNS`` in table order. A rule is reported only when it
    actually changed the text.

    Args:
        text: The string to scrub.
        enabled_rules: Names of the rules that are allowed to run.

    Returns:
        ``(redacted_text, applied)`` where ``applied`` lists the names of the
        rules that modified the text, in the order they fired.
    """
    # Treat the <private> tag as just another rule at the head of the table.
    rule_chain = [(_PRIVATE_TAG, "[REDACTED:private]", "private"), *_REDACT_PATTERNS]

    fired: list[str] = []
    current = text
    for regex, substitute, rule_name in rule_chain:
        if rule_name not in enabled_rules:
            continue
        candidate = regex.sub(substitute, current)
        if candidate == current:
            continue
        fired.append(rule_name)
        current = candidate
    return current, fired
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Export
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def collect_neighborhood(store, root_entity: str, depth: int,
                         include_retracted: bool) -> list[str]:
    """BFS from root, following both incoming and outgoing refs.

    Outgoing: triples WHERE entity_id = X AND value_type = 'ref' -> recurse to value.
    Incoming: triples WHERE value = X AND value_type = 'ref' -> recurse to entity_id.

    Args:
        store: Object exposing ``_conn``, a connection whose rows support
            access by column name.
        root_entity: Entity id the traversal starts from; always included.
        depth: Maximum number of hops to follow. ``0`` (or less) returns only
            the root.
        include_retracted: When false, retracted triples are not followed.

    Returns:
        Deterministically ordered (sorted) list of entity ids reachable
        within ``depth`` hops.
    """
    from collections import deque

    # Only this fixed fragment is interpolated into the SQL; entity ids are
    # always bound as parameters.
    retracted_filter = "" if include_retracted else "AND retracted = 0"
    # (query, column holding the neighbor id) for each direction.
    hops = (
        (f"""SELECT DISTINCT value FROM triples
             WHERE entity_id = ? AND value_type = 'ref' {retracted_filter}""", "value"),
        (f"""SELECT DISTINCT entity_id FROM triples
             WHERE value = ? AND value_type = 'ref' {retracted_filter}""", "entity_id"),
    )

    visited: set[str] = {root_entity}
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole list.
    queue = deque([(root_entity, 0)])

    while queue:
        eid, d = queue.popleft()
        if d >= depth:
            continue
        for sql, column in hops:
            for row in store._conn.execute(sql, (eid,)).fetchall():
                ref = row[column]
                if ref and ref not in visited:
                    visited.add(ref)
                    queue.append((ref, d + 1))

    return sorted(visited)  # deterministic ordering for stable checksum
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def serialize_entity(store, entity_id: str, include_retracted: bool,
                     redact_rules: set[str]) -> tuple[dict, list[str], int]:
    """Load every triple of one entity and scrub its string values.

    Only values whose ``value_type`` is ``"string"`` are passed through
    ``apply_redactions``, and only when at least one rule is enabled.

    Args:
        store: Object exposing ``_conn`` with name-addressable rows.
        entity_id: Id of the entity to serialize.
        include_retracted: When false, retracted triples are left out.
        redact_rules: Enabled redaction rule names (may be empty).

    Returns:
        ``({id, type, triples: [...]}, applied_rules, redacted_count)``.
        ``applied_rules`` may contain repeats (one entry per rule per
        affected triple); ``redacted_count`` is the number of triples changed.
    """
    retracted_clause = "AND retracted = 0" if not include_retracted else ""
    cursor = store._conn.execute(
        f"""SELECT attribute, value, value_type, tx_id, created_at, retracted, valid_to
            FROM triples WHERE entity_id = ? {retracted_clause}
            ORDER BY tx_id, attribute, value""",
        (entity_id,),
    )

    serialized: list[dict] = []
    fired_rules: list[str] = []
    touched = 0
    for row in cursor.fetchall():
        out_value = row["value"]
        if row["value_type"] == "string" and redact_rules and out_value:
            scrubbed, fired = apply_redactions(out_value, redact_rules)
            if fired:
                fired_rules += fired
                touched += 1
                out_value = scrubbed
        serialized.append({
            "attribute": row["attribute"],
            "value": out_value,
            "value_type": row["value_type"],
            "tx_id": row["tx_id"],
            "created_at": row["created_at"],
            "retracted": int(row["retracted"]),
            "valid_to": row["valid_to"],
        })

    # The entity type is whatever precedes the first ":" in the id.
    kind, colon, _rest = entity_id.partition(":")
    if not colon:
        kind = "unknown"
    entity = {"id": entity_id, "type": kind, "triples": serialized}
    return entity, fired_rules, touched
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def fetch_cached_page(web_db_path: str, root_entity: str,
                      redact_rules: set[str]) -> dict | None:
    """Pull the most recent cached rendered_page for root_entity, if any.

    Redactions are applied over the page summary and each section bullet's
    text too — the LLM may have woven sensitive content into its synthesis.

    This is best-effort: any failure is reported on stderr and yields
    ``None`` rather than aborting the export.

    Args:
        web_db_path: Path to the web database holding ``page_cache``.
        root_entity: Entity id whose cached page is wanted.
        redact_rules: Enabled redaction rule names (may be empty).

    Returns:
        The decoded page dict with ``generated_at`` and ``rendered_with``
        token counts filled in from the cache row, or ``None`` when the file
        or the row is missing, or the lookup fails.
    """
    import sqlite3
    from contextlib import closing

    if not Path(web_db_path).exists():
        return None
    try:
        # closing() guarantees the connection is released even when the query
        # raises (e.g. the page_cache table does not exist).
        with closing(sqlite3.connect(web_db_path)) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """SELECT page_json, generated_at, tokens_in, tokens_out, cost_usd
                   FROM page_cache WHERE entity_id = ?
                   ORDER BY generated_at DESC LIMIT 1""",
                (root_entity,),
            ).fetchone()
        if not row:
            return None
        page = json.loads(row["page_json"])
        # Redact rendered_page content.
        if redact_rules:
            if page.get("summary"):
                page["summary"], _ = apply_redactions(page["summary"], redact_rules)
            for sec in page.get("sections", []) or []:
                for b in sec.get("bullets", []) or []:
                    if b.get("text"):
                        b["text"], _ = apply_redactions(b["text"], redact_rules)
        page["generated_at"] = row["generated_at"]
        rendered_with = page.setdefault("rendered_with", {})
        # Token counts are copied only when truthy (NULL and 0 are skipped).
        # NOTE(review): cost_usd is selected but never copied into
        # rendered_with, while concept_import reads rendered_with.cost_usd —
        # confirm whether leaving it out of the bundle is intentional.
        if row["tokens_in"]:
            rendered_with["tokens_in"] = row["tokens_in"]
        if row["tokens_out"]:
            rendered_with["tokens_out"] = row["tokens_out"]
        return page
    except Exception as e:
        sys.stderr.write(f"fetch_cached_page failed: {e}\n")
        return None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def export_concept(db_path: str, root_entity: str, depth: int = 1,
                   include_retracted: bool = False,
                   include_page: bool = True,
                   web_db_path: str | None = None,
                   redact_rules: set[str] | None = None) -> dict:
    """Build a sinain-concept/v1 envelope for an entity and its neighborhood.

    Args:
        db_path: Path to the knowledge-graph database.
        root_entity: Entity id the bundle is centred on.
        depth: Number of ref hops to follow from the root.
        include_retracted: Also export (and traverse) retracted triples.
        include_page: Bundle the cached rendered page; only takes effect when
            ``web_db_path`` is given as well.
        web_db_path: Path to the web database holding the page cache.
        redact_rules: Enabled redaction rule names. ``None`` selects the
            default set; pass an empty set to disable redaction.

    Returns:
        The envelope dict, including a ``checksum`` computed over the
        canonical JSON of ``root_entity`` + ``entities``.
    """
    from triplestore import TripleStore

    if redact_rules is None:
        redact_rules = {"private", "creditcard", "apikey", "bearer", "awskey",
                        "password", "secret"}

    entities = []
    applied_rules: set[str] = set()
    total_redacted = 0
    triple_count = 0
    fact_count = 0

    store = TripleStore(db_path)
    try:
        entity_ids = collect_neighborhood(store, root_entity, depth, include_retracted)
        for eid in entity_ids:
            entity_obj, applied, n_redacted = serialize_entity(
                store, eid, include_retracted, redact_rules,
            )
            entities.append(entity_obj)
            applied_rules.update(applied)
            total_redacted += n_redacted
            triple_count += len(entity_obj["triples"])
            if eid.startswith("fact:"):
                fact_count += 1
    finally:
        # Release the store even when traversal or serialization fails.
        store.close()

    rendered_page = None
    if include_page and web_db_path:
        rendered_page = fetch_cached_page(web_db_path, root_entity, redact_rules)

    envelope = {
        "format": "sinain-concept/v1",
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "exporter": {
            "tool": "sinain-core",
            # NOTE(review): hard-coded literal — confirm it is meant to track
            # the released package version.
            "tool_version": "1.14.0",
            "schema_version": "triplestore/v3",
            "embedding_model": "all-MiniLM-L6-v2",
        },
        "root_entity": root_entity,
        "depth": depth,
        "stats": {
            "entities": len(entity_ids),
            "facts": fact_count,
            "triples": triple_count,
        },
        "entities": entities,
        "rendered_page": rendered_page,
        "redactions": {
            "applied": sorted(applied_rules),
            "rules_version": REDACT_RULES_VERSION,
            "redacted_count": total_redacted,
        },
    }

    # Compute checksum over canonical JSON of (root_entity + entities) — this is
    # what the receiver should validate against on import. The rendered page
    # and the metadata fields are deliberately outside the checksum.
    canonical = json.dumps(
        {"root_entity": root_entity, "entities": entities},
        sort_keys=True, ensure_ascii=False, separators=(",", ":"),
    )
    envelope["checksum"] = "sha256:" + hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return envelope
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def main() -> None:
    """CLI entry point: export one concept bundle and print it as JSON.

    Exits with status 1 (after printing a JSON error object) when the
    knowledge database given by ``--db`` does not exist.
    """
    cli = argparse.ArgumentParser(description="Concept Export")
    cli.add_argument("--db", required=True)
    cli.add_argument("--root", required=True, help="Root entity id (e.g. entity:citibank)")
    cli.add_argument("--depth", type=int, default=1)
    cli.add_argument("--include-retracted", action="store_true")
    cli.add_argument("--include-page", action="store_true",
                     help="Bundle the cached rendered_page if available")
    cli.add_argument("--web-db", default=None,
                     help="Path to web.db for page cache lookup")
    cli.add_argument("--redact", default="private,creditcard,apikey,bearer,awskey,password,secret",
                     help="Comma-separated redaction rule names")
    opts = cli.parse_args()

    if not Path(opts.db).exists():
        print(json.dumps({"error": f"db not found: {opts.db}"}))
        sys.exit(1)

    # Split the comma-separated list, dropping blanks and surrounding spaces.
    enabled: set[str] = set()
    for piece in opts.redact.split(","):
        piece = piece.strip()
        if piece:
            enabled.add(piece)

    bundle = export_concept(
        opts.db,
        opts.root,
        depth=opts.depth,
        include_retracted=opts.include_retracted,
        include_page=opts.include_page,
        web_db_path=opts.web_db,
        redact_rules=enabled,
    )
    print(json.dumps(bundle, ensure_ascii=False))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Concept Import — replay a sinain-concept/v1 bundle into the local triplestore.
|
|
3
|
+
|
|
4
|
+
Designed for idempotency: re-importing the same bundle in `merge` mode is a
|
|
5
|
+
no-op (existing triples skip-as-duplicate). The receiver re-issues tx_ids
|
|
6
|
+
locally but preserves the source-tx grouping, so the digest atomicity that
|
|
7
|
+
knowledge_integrator.py relies on survives the round-trip.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python3 concept_import.py --db <kg.db> --bundle <bundle.json> \\
|
|
11
|
+
[--web-db <web.db>] [--conflict skip|merge|overwrite]
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
import sqlite3
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def verify_envelope(envelope: dict) -> tuple[bool, str]:
    """Validate the envelope's format, basic shape and checksum.

    The bundle comes from another machine, so malformed input is reported as
    ``(False, message)`` instead of raising.

    Args:
        envelope: Decoded bundle JSON.

    Returns:
        ``(ok, error_msg)``; ``error_msg`` is empty when ``ok`` is true. An
        envelope that carries no checksum is accepted without a checksum
        comparison.
    """
    if not isinstance(envelope, dict):
        return False, "envelope must be a JSON object"
    fmt = envelope.get("format")
    if fmt != "sinain-concept/v1":
        return False, f"unsupported format: {fmt!r} (need sinain-concept/v1)"
    if "root_entity" not in envelope or "entities" not in envelope:
        return False, "envelope missing root_entity or entities"

    # The importer iterates entities and calls string methods on root_entity,
    # so reject wrong shapes here rather than crash mid-import.
    entities = envelope["entities"]
    if not isinstance(envelope["root_entity"], str):
        return False, "envelope root_entity must be a string"
    if not isinstance(entities, list) or not all(isinstance(e, dict) for e in entities):
        return False, "envelope entities must be a list of objects"

    expected = envelope.get("checksum")
    if expected:
        if not isinstance(expected, str):
            return False, "envelope checksum must be a string"
        # Must match the canonical form used by the exporter byte for byte.
        canonical = json.dumps(
            {"root_entity": envelope["root_entity"], "entities": entities},
            sort_keys=True, ensure_ascii=False, separators=(",", ":"),
        )
        actual = "sha256:" + hashlib.sha256(canonical.encode("utf-8")).hexdigest()
        if actual != expected:
            return False, f"checksum mismatch (expected {expected[:23]}..., got {actual[:23]}...)"
    return True, ""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def bundle_sha(envelope: dict) -> str:
    """Return the hex SHA-256 of the whole envelope, used to track imports.

    The envelope is serialized with sorted keys, so key order does not
    affect the digest.
    """
    serialized = json.dumps(envelope, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialized.encode("utf-8"))
    return digest.hexdigest()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def import_bundle(db_path: str, envelope: dict, conflict: str = "merge",
                  web_db_path: str | None = None) -> dict:
    """Replay envelope into the local knowledge graph.

    conflict modes:
      - skip:      existing (entity, attribute, value) wins; imported dropped.
      - merge:     duplicate triples skipped; new (attribute, value) inserted as new.
      - overwrite: retract conflicting active local triples, then assert imported.

    One local transaction is opened per source ``tx_id`` so the source
    grouping is preserved. If replay fails, pending changes are rolled back,
    the store is closed and the exception propagates. The page-cache write
    and the audit row are best-effort: failures are reported on stderr only.

    The envelope passed in is not modified.

    Args:
        db_path: Path to the local knowledge-graph database.
        envelope: A verified sinain-concept/v1 envelope.
        conflict: One of ``"skip"``, ``"merge"``, ``"overwrite"``.
        web_db_path: Optional web database that receives the rendered page
            and an audit row in ``concept_imports``.

    Returns:
        A result dict with ``ok``, ``imported``, ``root_entity``, ``stats``,
        ``rendered_page_cached`` and ``view_url``.

    Raises:
        ValueError: If ``conflict`` is not a known mode.
    """
    from contextlib import closing

    from triplestore import TripleStore

    if conflict not in ("skip", "merge", "overwrite"):
        raise ValueError(f"unknown conflict mode: {conflict!r}")

    # Group imported triples by their source_tx so we preserve atomicity.
    # source_tx_id -> list of
    #   (entity_id, attribute, value, value_type, retracted, valid_to, original_created_at)
    by_source_tx: dict[int, list[tuple]] = {}
    for ent in envelope.get("entities", []):
        eid = ent.get("id")
        for t in ent.get("triples", []):
            stx = int(t.get("tx_id") or 0)
            by_source_tx.setdefault(stx, []).append((
                eid,
                t["attribute"],
                t["value"],
                t.get("value_type", "string"),
                int(t.get("retracted", 0)),
                t.get("valid_to"),
                t.get("created_at"),
            ))

    inserted = 0
    skipped_dup = 0
    overwritten = 0
    tx_mapping: dict[int, int] = {}  # source_tx -> new_tx

    store = TripleStore(db_path)
    try:
        for source_tx in sorted(by_source_tx):
            # Begin one local tx per source tx (preserves grouping)
            new_tx = store.begin_tx(
                source="concept-import",
                metadata={"source_tx": source_tx, "bundle_root": envelope.get("root_entity")},
            )
            tx_mapping[source_tx] = new_tx

            for (eid, attr, value, value_type, retracted, valid_to, created_at) in by_source_tx[source_tx]:
                # Imported retracted triples -> preserve retracted state. (They're
                # part of the bundle's audit trail.)
                if retracted:
                    # Re-importing the same bundle must not pile up copies of
                    # its retracted rows: in skip/merge mode, an identical
                    # retracted row carrying the same source timestamp counts
                    # as a duplicate. Rows without a source created_at cannot
                    # be matched and are inserted as before.
                    if conflict != "overwrite" and created_at:
                        already = store._conn.execute(
                            """SELECT id FROM triples WHERE entity_id = ? AND attribute = ?
                               AND value = ? AND retracted = 1 AND created_at = ? LIMIT 1""",
                            (eid, attr, value, created_at),
                        ).fetchone()
                        if already:
                            skipped_dup += 1
                            continue
                    # Direct INSERT to preserve the retracted flag —
                    # assert_triple always sets retracted=0. valid_to stays as-is.
                    store._conn.execute(
                        """INSERT INTO triples
                           (tx_id, entity_id, attribute, value, value_type, retracted, retracted_tx, valid_to, created_at)
                           VALUES (?, ?, ?, ?, ?, 1, ?, ?, ?)""",
                        (new_tx, eid, attr, value, value_type, new_tx, valid_to, created_at or _iso_now()),
                    )
                    store._conn.execute(
                        "INSERT OR IGNORE INTO entity_types (entity_id, entity_type) VALUES (?, ?)",
                        (eid, eid.split(":", 1)[0] if ":" in eid else "unknown"),
                    )
                    inserted += 1
                    continue

                # Active triple — apply conflict mode for (entity, attribute, value)
                existing = store._conn.execute(
                    """SELECT id FROM triples WHERE entity_id = ? AND attribute = ?
                       AND value = ? AND retracted = 0 LIMIT 1""",
                    (eid, attr, value),
                ).fetchone()

                if existing:
                    # Exact triple already present
                    if conflict in ("skip", "merge"):
                        skipped_dup += 1
                        continue
                    # overwrite: retract conflicting then insert imported
                    store.retract_triple(new_tx, eid, attr, value)
                    overwritten += 1

                store.assert_triple(new_tx, eid, attr, value, value_type=value_type)
                # Patch created_at to preserve source timeline (assert_triple uses now()).
                if created_at:
                    store._conn.execute(
                        """UPDATE triples SET created_at = ?
                           WHERE tx_id = ? AND entity_id = ? AND attribute = ? AND value = ?
                           AND id = (SELECT MAX(id) FROM triples WHERE tx_id = ? AND entity_id = ? AND attribute = ? AND value = ?)""",
                        (created_at, new_tx, eid, attr, value, new_tx, eid, attr, value),
                    )
                inserted += 1

        store._conn.commit()
    except Exception:
        # Discard the partial import explicitly before the store is closed.
        store._conn.rollback()
        raise
    finally:
        store.close()

    # Page cache reuse
    page_cached = False
    rendered = envelope.get("rendered_page")
    if web_db_path and rendered:
        # Map source tx_watermark to local — max new_tx issued for this bundle.
        local_watermark = max(tx_mapping.values()) if tx_mapping else 0
        try:
            # Work on a copy: writing tx_watermark into the caller's envelope
            # would make the audit sha below depend on local transaction ids.
            page = {**rendered, "tx_watermark": local_watermark}
            rendered_with = page.get("rendered_with") or {}
            with closing(sqlite3.connect(web_db_path)) as conn:
                conn.execute(
                    """INSERT OR REPLACE INTO page_cache
                       (entity_id, tx_watermark, page_json, generated_at, tokens_in, tokens_out, cost_usd)
                       VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    (
                        envelope["root_entity"],
                        local_watermark,
                        json.dumps(page, ensure_ascii=False),
                        int(time.time() * 1000),
                        rendered_with.get("tokens_in"),
                        rendered_with.get("tokens_out"),
                        rendered_with.get("cost_usd"),
                    ),
                )
                conn.commit()
            page_cached = True
        except Exception as e:
            sys.stderr.write(f"page cache reuse failed: {e}\n")

    # Audit row in concept_imports
    if web_db_path:
        try:
            sha = bundle_sha(envelope)
            exporter = envelope.get("exporter") or {}
            with closing(sqlite3.connect(web_db_path)) as conn:
                conn.execute(
                    """INSERT INTO concept_imports
                       (imported_at, root_entity, source_tool, source_version, envelope_format,
                        bundle_sha256, conflict_mode, triples_count, redactions_seen, notes)
                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        int(time.time() * 1000),
                        envelope["root_entity"],
                        exporter.get("tool"),
                        exporter.get("tool_version"),
                        envelope.get("format"),
                        sha,
                        conflict,
                        inserted,
                        json.dumps((envelope.get("redactions") or {}).get("applied", [])),
                        None,
                    ),
                )
                conn.commit()
        except Exception as e:
            sys.stderr.write(f"concept_imports log failed: {e}\n")

    return {
        "ok": True,
        "imported": True,
        "root_entity": envelope.get("root_entity"),
        "stats": {
            "entities_seen": len(envelope.get("entities", [])),
            "triples_inserted": inserted,
            "triples_skipped_duplicate": skipped_dup,
            "triples_overwritten": overwritten,
            "tx_mapping_count": len(tx_mapping),
        },
        "rendered_page_cached": page_cached,
        "view_url": f"/knowledge/ui/entity/{envelope.get('root_entity', '').replace(':', '%3A')}",
    }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _iso_now() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_struct = time.gmtime()
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    return time.strftime(stamp_format, utc_struct)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def main() -> None:
    """CLI entry point: import a concept bundle and print the result as JSON.

    The bundle is read and verified before anything is created on disk, so an
    unreadable or invalid bundle leaves no empty database behind. Every
    failure to load or verify the bundle is reported as
    ``{"ok": false, "error": ...}`` on stdout with exit status 1.
    """
    parser = argparse.ArgumentParser(description="Concept Import")
    parser.add_argument("--db", required=True)
    parser.add_argument("--bundle", required=True, help="Path to .sinain-concept.json (or - for stdin)")
    parser.add_argument("--web-db", default=None)
    parser.add_argument("--conflict", choices=["skip", "merge", "overwrite"], default="merge")
    args = parser.parse_args()

    def fail(message: str) -> None:
        # Same error shape for every pre-import failure.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    try:
        if args.bundle == "-":
            envelope = json.load(sys.stdin)
        else:
            envelope = json.loads(Path(args.bundle).read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        fail(f"cannot read bundle: {e}")

    if not isinstance(envelope, dict):
        fail("bundle is not a JSON object")

    ok, err = verify_envelope(envelope)
    if not ok:
        fail(err)

    if not Path(args.db).exists():
        # Auto-create empty knowledge DB on first import — receiver may have nothing yet.
        Path(args.db).parent.mkdir(parents=True, exist_ok=True)
        from triplestore import TripleStore
        TripleStore(args.db).close()

    result = import_bundle(args.db, envelope, conflict=args.conflict,
                           web_db_path=args.web_db)
    print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()
|