@geravant/sinain 1.19.0 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,254 @@
1
+ #!/usr/bin/env python3
2
+ """Concept Import — replay a sinain-concept/v1 bundle into the local triplestore.
3
+
4
+ Designed for idempotency: re-importing the same bundle in `merge` mode is a
5
+ no-op (existing triples skip-as-duplicate). The receiver re-issues tx_ids
6
+ locally but preserves the source-tx grouping, so the digest atomicity that
7
+ knowledge_integrator.py relies on survives the round-trip.
8
+
9
+ Usage:
10
+ python3 concept_import.py --db <kg.db> --bundle <bundle.json> \\
11
+ [--web-db <web.db>] [--conflict skip|merge|overwrite]
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import hashlib
17
+ import json
18
+ import sqlite3
19
+ import sys
20
+ import time
21
+ from pathlib import Path
22
+
23
+
24
def verify_envelope(envelope: dict) -> tuple[bool, str]:
    """Validate the envelope's format tag, required keys, and optional checksum.

    Args:
        envelope: Parsed bundle JSON. A value that is not a JSON object is
            rejected with an error message instead of raising.

    Returns:
        ``(True, "")`` when the envelope is acceptable, otherwise
        ``(False, error_msg)`` describing the first problem found.
    """
    # A bundle file can hold any JSON value (list, string, number, ...);
    # reject those up front instead of failing with AttributeError on .get().
    if not isinstance(envelope, dict):
        return False, f"envelope must be a JSON object, got {type(envelope).__name__}"

    fmt = envelope.get("format")
    if fmt != "sinain-concept/v1":
        return False, f"unsupported format: {fmt!r} (need sinain-concept/v1)"
    if "root_entity" not in envelope or "entities" not in envelope:
        return False, "envelope missing root_entity or entities"

    expected = envelope.get("checksum")
    if not expected:
        # The checksum is optional: an absent or empty one is not verified.
        return True, ""
    if not isinstance(expected, str):
        # Slicing a non-string below would raise; report it as a bad envelope.
        return False, f"checksum must be a string, got {type(expected).__name__}"

    # The checksum covers only root_entity + entities, serialized compactly
    # with sorted keys so the digest does not depend on key order.
    canonical = json.dumps(
        {"root_entity": envelope["root_entity"], "entities": envelope["entities"]},
        sort_keys=True, ensure_ascii=False, separators=(",", ":"),
    )
    actual = "sha256:" + hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    if actual != expected:
        return False, f"checksum mismatch (expected {expected[:23]}..., got {actual[:23]}...)"
    return True, ""
41
+
42
+
43
def bundle_sha(envelope: dict) -> str:
    """Return the hex SHA-256 of the whole envelope, serialized with sorted keys.

    This is the envelope-level fingerprint used for idempotency tracking
    (web.db.concept_imports).
    """
    serialized = json.dumps(envelope, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(serialized.encode("utf-8"))
    return digest.hexdigest()
47
+
48
+
49
def _group_triples_by_source_tx(envelope: dict) -> dict[int, list[tuple]]:
    """Bucket every triple in the envelope under the tx_id it carried at the source.

    Each bucket entry is a tuple of
    (entity_id, attribute, value, value_type, retracted, valid_to, created_at).
    """
    grouped: dict[int, list[tuple]] = {}
    for ent in envelope.get("entities", []):
        eid = ent.get("id")
        for t in ent.get("triples", []):
            # Triples with a missing/empty tx_id all land in bucket 0.
            stx = int(t.get("tx_id") or 0)
            grouped.setdefault(stx, []).append((
                eid,
                t["attribute"],
                t["value"],
                t.get("value_type", "string"),
                int(t.get("retracted", 0)),
                t.get("valid_to"),
                t.get("created_at"),
            ))
    return grouped


def _cache_rendered_page(web_db_path: str, envelope: dict, local_watermark: int) -> bool:
    """Store the bundle's pre-rendered page in web.db page_cache (best-effort).

    Returns True when the row was written; on any failure the error is written
    to stderr and False is returned.
    """
    from contextlib import closing

    try:
        # Stamp the local watermark on a copy: writing it into the caller's
        # envelope would change what bundle_sha() hashes afterwards and make
        # the recorded bundle fingerprint differ on every import.
        page = dict(envelope["rendered_page"])
        page["tx_watermark"] = local_watermark
        rendered_with = page.get("rendered_with") or {}
        # closing() guarantees the connection is released even if the INSERT fails.
        with closing(sqlite3.connect(web_db_path)) as conn:
            conn.execute(
                """INSERT OR REPLACE INTO page_cache
                   (entity_id, tx_watermark, page_json, generated_at, tokens_in, tokens_out, cost_usd)
                   VALUES (?, ?, ?, ?, ?, ?, ?)""",
                (
                    envelope["root_entity"],
                    local_watermark,
                    json.dumps(page, ensure_ascii=False),
                    int(time.time() * 1000),
                    rendered_with.get("tokens_in"),
                    rendered_with.get("tokens_out"),
                    rendered_with.get("cost_usd"),
                ),
            )
            conn.commit()
        return True
    except Exception as e:  # deliberate best-effort: caching must not fail the import
        sys.stderr.write(f"page cache reuse failed: {e}\n")
        return False


def _record_concept_import(web_db_path: str, envelope: dict, conflict: str,
                           inserted: int) -> None:
    """Append an audit row to web.db concept_imports (best-effort).

    Failures are written to stderr and otherwise ignored.
    """
    from contextlib import closing

    try:
        sha = bundle_sha(envelope)
        exporter = envelope.get("exporter") or {}
        redactions = envelope.get("redactions") or {}
        with closing(sqlite3.connect(web_db_path)) as conn:
            conn.execute(
                """INSERT INTO concept_imports
                   (imported_at, root_entity, source_tool, source_version, envelope_format,
                    bundle_sha256, conflict_mode, triples_count, redactions_seen, notes)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    int(time.time() * 1000),
                    envelope["root_entity"],
                    exporter.get("tool"),
                    exporter.get("tool_version"),
                    envelope.get("format"),
                    sha,
                    conflict,
                    inserted,
                    json.dumps(redactions.get("applied", [])),
                    None,
                ),
            )
            conn.commit()
    except Exception as e:  # deliberate best-effort: audit logging must not fail the import
        sys.stderr.write(f"concept_imports log failed: {e}\n")


def import_bundle(db_path: str, envelope: dict, conflict: str = "merge",
                  web_db_path: str | None = None) -> dict:
    """Replay an envelope into the local knowledge graph.

    One local transaction is opened per distinct source tx_id, in ascending
    source order, so triples that shared a transaction at the source still
    share one locally. The envelope passed in is not modified.

    Conflict modes, applied to non-retracted triples whose exact
    (entity, attribute, value) is already active locally:
      - skip / merge: the imported triple is dropped and counted as a
        duplicate (the two modes currently behave the same here).
      - overwrite: the local triple is retracted, then the imported one is
        asserted in its place.
    Triples with no exact active match are inserted in every mode.

    Args:
        db_path: Path of the knowledge-graph database handed to TripleStore.
        envelope: Parsed bundle; expected to have passed verify_envelope().
        conflict: One of "skip", "merge", "overwrite".
        web_db_path: Optional web.db path. When given, an audit row is written
            to concept_imports and, if the bundle carries a rendered_page, the
            page is stored in page_cache. Both writes are best-effort.

    Returns:
        A result dict with "ok", "imported", "root_entity", "stats",
        "rendered_page_cached" and "view_url".
    """
    from triplestore import TripleStore

    # Group before opening the store so a malformed bundle (e.g. a triple with
    # no "attribute") fails before any database handle is created.
    by_source_tx = _group_triples_by_source_tx(envelope)

    inserted = 0
    skipped_dup = 0
    overwritten = 0
    tx_mapping: dict[int, int] = {}  # source_tx -> new local tx

    store = TripleStore(db_path)
    try:
        for source_tx in sorted(by_source_tx):
            # Begin one local tx per source tx (preserves grouping).
            new_tx = store.begin_tx(
                source="concept-import",
                metadata={"source_tx": source_tx, "bundle_root": envelope.get("root_entity")},
            )
            tx_mapping[source_tx] = new_tx

            for (eid, attr, value, value_type, retracted, valid_to, created_at) in by_source_tx[source_tx]:
                if retracted:
                    # Retracted triples are part of the bundle's audit trail, so
                    # they are kept in their retracted state. A direct INSERT is
                    # used because (per the original author's note)
                    # assert_triple always sets retracted=0.
                    # NOTE(review): there is no duplicate check on this path, so
                    # re-importing a bundle inserts its retracted rows again.
                    store._conn.execute(
                        """INSERT INTO triples
                           (tx_id, entity_id, attribute, value, value_type, retracted, retracted_tx, valid_to, created_at)
                           VALUES (?, ?, ?, ?, ?, 1, ?, ?, ?)""",
                        (new_tx, eid, attr, value, value_type, new_tx, valid_to, created_at or _iso_now()),
                    )
                    # Entity type is the prefix before the first ":" in the id.
                    store._conn.execute(
                        "INSERT OR IGNORE INTO entity_types (entity_id, entity_type) VALUES (?, ?)",
                        (eid, eid.split(":", 1)[0] if ":" in eid else "unknown"),
                    )
                    inserted += 1
                    continue

                # Active triple: look for an identical active triple locally.
                existing = store._conn.execute(
                    """SELECT id FROM triples WHERE entity_id = ? AND attribute = ?
                       AND value = ? AND retracted = 0 LIMIT 1""",
                    (eid, attr, value),
                ).fetchone()

                if existing:
                    if conflict in ("skip", "merge"):
                        skipped_dup += 1
                        continue
                    if conflict == "overwrite":
                        # Retract the local copy, then fall through to insert.
                        store.retract_triple(new_tx, eid, attr, value)
                        overwritten += 1

                store.assert_triple(new_tx, eid, attr, value, value_type=value_type)
                # Patch created_at on the row just asserted (highest id for this
                # tx/entity/attribute/value) to preserve the source timeline.
                if created_at:
                    store._conn.execute(
                        """UPDATE triples SET created_at = ?
                           WHERE tx_id = ? AND entity_id = ? AND attribute = ? AND value = ?
                           AND id = (SELECT MAX(id) FROM triples WHERE tx_id = ? AND entity_id = ? AND attribute = ? AND value = ?)""",
                        (created_at, new_tx, eid, attr, value, new_tx, eid, attr, value),
                    )
                inserted += 1

            store._conn.commit()
    finally:
        # Release the store even when a triple fails mid-import.
        store.close()

    # Page cache reuse: the source tx_watermark is replaced with the highest
    # local tx created for this bundle.
    page_cached = False
    if web_db_path and envelope.get("rendered_page"):
        local_watermark = max(tx_mapping.values(), default=0)
        page_cached = _cache_rendered_page(web_db_path, envelope, local_watermark)

    # Audit row in concept_imports.
    if web_db_path:
        _record_concept_import(web_db_path, envelope, conflict, inserted)

    return {
        "ok": True,
        "imported": True,
        "root_entity": envelope.get("root_entity"),
        "stats": {
            "entities_seen": len(envelope.get("entities", [])),
            "triples_inserted": inserted,
            "triples_skipped_duplicate": skipped_dup,
            "triples_overwritten": overwritten,
            "tx_mapping_count": len(tx_mapping),
        },
        "rendered_page_cached": page_cached,
        "view_url": f"/knowledge/ui/entity/{envelope.get('root_entity', '').replace(':', '%3A')}",
    }
218
+
219
+
220
def _iso_now() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    utc_now = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_now)
222
+
223
+
224
def main() -> None:
    """CLI entry point: parse arguments, load and verify the bundle, import it.

    Prints exactly one JSON object on stdout. Exits with status 1 (after
    printing ``{"ok": false, "error": ...}``) when the bundle cannot be read
    or parsed, or when verify_envelope() rejects it.
    """
    parser = argparse.ArgumentParser(description="Concept Import")
    parser.add_argument("--db", required=True)
    parser.add_argument("--bundle", required=True, help="Path to .sinain-concept.json (or - for stdin)")
    parser.add_argument("--web-db", default=None)
    parser.add_argument("--conflict", choices=["skip", "merge", "overwrite"], default="merge")
    args = parser.parse_args()

    # Load the bundle first. A missing file or malformed JSON is reported in
    # the same JSON error shape as a verification failure, not as a traceback.
    try:
        if args.bundle == "-":
            envelope = json.load(sys.stdin)
        else:
            envelope = json.loads(Path(args.bundle).read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:  # ValueError covers JSON and UTF-8 decode errors
        print(json.dumps({"ok": False, "error": f"cannot read bundle: {exc}"}))
        sys.exit(1)

    ok, err = verify_envelope(envelope)
    if not ok:
        print(json.dumps({"ok": False, "error": err}))
        sys.exit(1)

    # Auto-create an empty knowledge DB on first import — the receiver may have
    # nothing yet. Done only after the bundle is accepted so a rejected bundle
    # leaves no new files behind.
    db_file = Path(args.db)
    if not db_file.exists():
        db_file.parent.mkdir(parents=True, exist_ok=True)
        from triplestore import TripleStore
        TripleStore(args.db).close()

    result = import_bundle(args.db, envelope, conflict=args.conflict,
                           web_db_path=args.web_db)
    print(json.dumps(result, ensure_ascii=False))
251
+
252
+
253
# Run the CLI only when this file is executed as a script; importing the
# module (e.g. to call import_bundle directly) does not trigger main().
if __name__ == "__main__":
    main()