mempalace-code 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mempalace/export.py ADDED
@@ -0,0 +1,378 @@
1
+ """
2
+ export.py — Export and import drawers + KG triples as JSONL
3
+
4
+ Provides backup/restore for manually-added drawers, diary entries, and knowledge
5
+ graph triples that would otherwise be lost when nuking and re-seeding a palace.
6
+
7
+ Typical workflow:
8
+ # Before nuke-and-re-seed:
9
+ mempalace export --only-manual --with-kg --out backup.jsonl
10
+
11
+ # After re-mine:
12
+ mempalace import backup.jsonl
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import sys
19
+ from datetime import datetime
20
+ from typing import Any, Dict, Iterator, List, Optional
21
+
22
+ from .version import __version__
23
+
24
# Chunker strategies produced by manual writes (MCP add_drawer + diary).
# Drawers tagged with one of these are what `export --only-manual` preserves
# across a nuke-and-re-seed (see _build_drawer_where).
_MANUAL_STRATEGIES = ("manual_v1", "diary_v1")
26
+
27
+
28
+ # ── Header ────────────────────────────────────────────────────────────────────
29
+
30
+
31
def _make_header(
    palace_path: str,
    filters: Dict[str, Any],
    drawer_count: int,
    kg_count: int,
) -> Dict[str, Any]:
    """Build the leading ``export_header`` record for an export stream.

    The header carries provenance (tool version, palace path, timestamp),
    the active export filters, and the record counts written after it.
    """
    # Key order matters for readable output: json.dumps preserves insertion
    # order, so keep this layout stable.
    header: Dict[str, Any] = dict(
        type="export_header",
        version=__version__,
        palace_path=palace_path,
        exported_at=datetime.now().isoformat(),
        filters=filters,
        drawer_count=drawer_count,
        kg_count=kg_count,
    )
    return header
46
+
47
+
48
+ # ── Export ────────────────────────────────────────────────────────────────────
49
+
50
+
51
+ def _build_drawer_where(
52
+ only_manual: bool = False,
53
+ wing: Optional[str] = None,
54
+ room: Optional[str] = None,
55
+ ) -> Optional[Dict[str, Any]]:
56
+ """Build a DrawerStore where-filter dict from export options."""
57
+ clauses = []
58
+ if only_manual:
59
+ clauses.append({"$or": [{"chunker_strategy": s} for s in _MANUAL_STRATEGIES]})
60
+ if wing:
61
+ clauses.append({"wing": wing})
62
+ if room:
63
+ clauses.append({"room": room})
64
+
65
+ if not clauses:
66
+ return None
67
+ if len(clauses) == 1:
68
+ return clauses[0]
69
+ return {"$and": clauses}
70
+
71
+
72
def export_drawers(
    store,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
    include_vectors: bool = False,
) -> Iterator[Dict[str, Any]]:
    """Stream one export record dict per drawer in the store.

    Wing/room/manual filtering happens at the store level; the *since*
    cutoff is applied client-side as an ISO-date string comparison on the
    ``filed_at`` field.
    """
    # Store-level where filter, built inline (same shape _build_drawer_where
    # produces): None, a single clause, or an $and of clauses.
    clauses: List[Dict[str, Any]] = []
    if only_manual:
        clauses.append({"$or": [{"chunker_strategy": s} for s in _MANUAL_STRATEGIES]})
    if wing:
        clauses.append({"wing": wing})
    if room:
        clauses.append({"room": room})
    if not clauses:
        where = None
    elif len(clauses) == 1:
        where = clauses[0]
    else:
        where = {"$and": clauses}

    # Metadata fields copied verbatim into each record. The drawer's own
    # `type` metadata is handled separately to avoid clobbering the record
    # marker, so it is absent from this tuple.
    meta_fields = (
        "wing",
        "room",
        "source_file",
        "chunk_index",
        "added_by",
        "filed_at",
        "hall",
        "topic",
        "agent",
        "date",
        "ingest_mode",
        "extract_mode",
        "compression_ratio",
        "original_tokens",
        "language",
        "symbol_name",
        "symbol_type",
        "source_hash",
        "extractor_version",
        "chunker_strategy",
    )

    for batch in store.iter_all(where=where, include_vectors=include_vectors):
        for row in batch:
            # Client-side recency filter on filed_at (lexicographic ISO compare).
            if since and row.get("filed_at", "") < since:
                continue

            embedding = None
            if include_vectors:
                raw = row.get("vector")
                if raw is not None:
                    embedding = list(raw)

            record: Dict[str, Any] = {
                "type": "drawer",
                "id": row.get("id", ""),
                "text": row.get("text", ""),
                "embedding": embedding,
            }
            for field in meta_fields:
                record[field] = row.get(field, "")
            # `type` is overloaded: the drawer metadata `type` travels as
            # `drawer_type` so the record-kind marker stays intact.
            record["drawer_type"] = row.get("type", "")
            yield record
123
+
124
+
125
def export_kg(
    kg,
    since: Optional[str] = None,
) -> Iterator[Dict[str, Any]]:
    """Stream one ``kg_triple`` export record per knowledge-graph triple.

    Triples whose ``valid_from`` predates *since* are skipped; triples with
    no ``valid_from`` are always included.
    """
    for page in kg.iter_all_triples():
        for triple in page:
            started = triple.get("valid_from")
            if since and started and started < since:
                continue
            # Merge the triple over the record marker; a `type` key inside
            # the triple (if any) wins, matching dict.update semantics.
            yield {"type": "kg_triple", **triple}
137
+
138
+
139
def _count_drawers(
    store,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
) -> int:
    """Count drawers matching the export filters (consumes one full pass)."""
    total = 0
    for _record in export_drawers(
        store, only_manual=only_manual, wing=wing, room=room, since=since
    ):
        total += 1
    return total
150
+
151
+
152
def _count_kg(kg, since: Optional[str] = None) -> int:
    """Count KG triples that the `since` cutoff would leave in an export."""
    total = 0
    for _triple in export_kg(kg, since=since):
        total += 1
    return total
154
+
155
+
156
def write_jsonl(
    path: str,
    store,
    kg=None,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
    include_vectors: bool = False,
    include_kg: bool = False,
    pretty: bool = False,
    palace_path: str = "",
) -> Dict[str, int]:
    """Write export JSONL to *path* (use '-' for stdout).

    Emits one ``export_header`` record, then one ``drawer`` record per
    matching drawer, then (when ``include_kg``) one ``kg_triple`` record
    per triple — one json.dumps call per record.

    Args:
        path: Output file, or '-' to stream to stdout.
        store: DrawerStore-like object handed to export_drawers.
        kg: Knowledge-graph object; consulted only when include_kg is True.
        only_manual, wing, room, since: Export filters (see export_drawers).
        include_vectors: Embed stored vectors in drawer records.
        include_kg: Also export KG triples.
        pretty: Indent each JSON document for human reading.
            NOTE(review): with pretty=True each record spans multiple lines,
            so the output is no longer strict one-object-per-line JSONL —
            confirm the intended reader can handle that before relying on
            pretty output for backups.
        palace_path: Recorded in the header for provenance only.

    Returns summary dict: {drawer_count, kg_count}.
    """
    indent = 2 if pretty else None

    # Record only the filters that are actually active, so the header
    # documents exactly how this export was produced.
    filters: Dict[str, Any] = {}
    if only_manual:
        filters["only_manual"] = True
    if wing:
        filters["wing"] = wing
    if room:
        filters["room"] = room
    if since:
        filters["since"] = since
    if include_vectors:
        filters["with_embeddings"] = True
    if include_kg:
        filters["with_kg"] = True

    # Pre-count for header (two passes — acceptable for the sizes we handle)
    drawer_count = _count_drawers(store, only_manual=only_manual, wing=wing, room=room, since=since)
    kg_count = _count_kg(kg, since=since) if (include_kg and kg is not None) else 0

    header = _make_header(
        palace_path=palace_path,
        filters=filters,
        drawer_count=drawer_count,
        kg_count=kg_count,
    )

    # '-' streams to stdout; only file handles we opened are closed below.
    fh = sys.stdout if path == "-" else open(path, "w", encoding="utf-8")
    try:
        fh.write(json.dumps(header, indent=indent) + "\n")

        for record in export_drawers(
            store,
            only_manual=only_manual,
            wing=wing,
            room=room,
            since=since,
            include_vectors=include_vectors,
        ):
            fh.write(json.dumps(record, indent=indent) + "\n")

        if include_kg and kg is not None:
            for record in export_kg(kg, since=since):
                fh.write(json.dumps(record, indent=indent) + "\n")
    finally:
        if fh is not sys.stdout:
            fh.close()

    return {"drawer_count": drawer_count, "kg_count": kg_count}
222
+
223
+
224
+ # ── Import ────────────────────────────────────────────────────────────────────
225
+
226
+
227
def read_jsonl(path: str) -> Iterator[Dict[str, Any]]:
    """Yield parsed JSON objects from *path* ('-' reads stdin).

    Accepts strict JSONL (one object per line) and also documents that span
    multiple lines — e.g. files produced by ``write_jsonl`` with
    ``pretty=True``, which a line-at-a-time ``json.loads`` cannot parse.
    Blank space between documents is ignored.

    Raises:
        json.JSONDecodeError: if the input ends with unparseable content.
    """
    decoder = json.JSONDecoder()
    fh = sys.stdin if path == "-" else open(path, encoding="utf-8")
    try:
        buf = ""
        for line in fh:
            buf = (buf + line).lstrip()
            # Drain every complete document currently in the buffer; stop on
            # a partial document and wait for more lines.
            while buf:
                try:
                    obj, end = decoder.raw_decode(buf)
                except json.JSONDecodeError:
                    break
                yield obj
                buf = buf[end:].lstrip()
        if buf:
            # Leftover text never formed a complete document — let json
            # raise the same exception type the strict parser raised.
            yield json.loads(buf)
    finally:
        if fh is not sys.stdin:
            fh.close()
238
+
239
+
240
def import_jsonl(
    path: str,
    store,
    kg=None,
    skip_dedup: bool = False,
    skip_kg: bool = False,
    dry_run: bool = False,
    wing_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Import drawers and KG triples from a JSONL export file.

    Drawers are re-added via ``store.add`` (text + metadata only — embedded
    vectors in the export are NOT restored; the store re-embeds from text).
    Near-duplicate drawers are skipped via a similarity query unless
    ``skip_dedup``. ``dry_run`` counts what would be imported without
    writing. ``wing_override`` replaces each drawer's exported wing.

    Returns summary: {imported_drawers, skipped_duplicates, imported_triples, warnings}.
    """
    imported_drawers = 0
    skipped_duplicates = 0
    imported_triples = 0
    warnings: List[str] = []

    # Track whether we've seen a header, and warn at most once if not.
    header_seen = False
    no_header_warned = False

    for record in read_jsonl(path):
        rtype = record.get("type")

        if rtype == "export_header":
            header_seen = True
            file_version = record.get("version", "")
            # Version mismatch is a warning, never a hard failure.
            if file_version and file_version != __version__:
                msg = (
                    f"Version mismatch: export was created with {file_version}, "
                    f"current is {__version__}. Proceeding anyway."
                )
                warnings.append(msg)
                print(f"WARNING: {msg}", file=sys.stderr)
            continue

        if not header_seen and not no_header_warned:
            warnings.append("No export_header found at start of file — format may be invalid.")
            no_header_warned = True

        if rtype == "drawer":
            wing = wing_override or record.get("wing", "")
            room = record.get("room", "")
            text = record.get("text", "")
            drawer_id = record.get("id", "")

            # A drawer without text cannot be re-embedded; skip silently.
            if not text:
                continue

            # Dedup check via cosine similarity
            if not skip_dedup:
                try:
                    results = store.query(
                        query_texts=[text],
                        n_results=1,
                        include=["distances"],
                    )
                    dists = results.get("distances", [[]])[0]
                    if dists:
                        # LanceDB returns L2 distance; convert to approximate cosine similarity
                        # For unit vectors: cosine_sim ≈ 1 - (L2^2 / 2)
                        # At threshold 0.9 cosine → L2^2 ≈ 0.2 → L2 ≈ 0.447
                        # NOTE(review): assumes distances are plain (not squared)
                        # L2 over unit vectors — verify against the store backend.
                        l2 = dists[0]
                        cosine_sim = max(0.0, 1.0 - (l2 * l2) / 2.0)
                        if cosine_sim >= 0.9:
                            skipped_duplicates += 1
                            continue
                except Exception:
                    pass  # If dedup check fails, proceed with import

            # Dry runs count the drawer as importable but write nothing.
            if dry_run:
                imported_drawers += 1
                continue

            # Build metadata
            meta_keys = (
                "source_file",
                "chunk_index",
                "added_by",
                "filed_at",
                "hall",
                "topic",
                "agent",
                "date",
                "ingest_mode",
                "extract_mode",
                "compression_ratio",
                "original_tokens",
                "language",
                "symbol_name",
                "symbol_type",
                "source_hash",
                "extractor_version",
                "chunker_strategy",
            )
            # Only copy keys actually present in the record, so absent fields
            # don't become empty metadata entries.
            meta: Dict[str, Any] = {"wing": wing, "room": room}
            for k in meta_keys:
                if k in record:
                    meta[k] = record[k]
            # `drawer_type` was stored to avoid collision with the record `type` key
            if "drawer_type" in record:
                meta["type"] = record["drawer_type"]

            try:
                store.add(ids=[drawer_id], documents=[text], metadatas=[meta])
                imported_drawers += 1
            except Exception as exc:
                # Duplicate ID — try upsert
                try:
                    store.upsert(ids=[drawer_id], documents=[text], metadatas=[meta])
                    imported_drawers += 1
                except Exception:
                    # Report the ORIGINAL add failure, which is usually the
                    # more informative of the two.
                    warnings.append(f"Failed to import drawer {drawer_id}: {exc}")

        elif rtype == "kg_triple" and not skip_kg and kg is not None:
            if dry_run:
                imported_triples += 1
                continue
            try:
                # Missing optional fields default to None / 1.0 confidence.
                kg.add_triple(
                    subject=record.get("subject", ""),
                    predicate=record.get("predicate", ""),
                    obj=record.get("object", ""),
                    valid_from=record.get("valid_from"),
                    valid_to=record.get("valid_to"),
                    confidence=record.get("confidence", 1.0),
                    source_closet=record.get("source_closet"),
                    source_file=record.get("source_file"),
                )
                imported_triples += 1
            except Exception as exc:
                warnings.append(f"Failed to import KG triple: {exc}")

    return {
        "imported_drawers": imported_drawers,
        "skipped_duplicates": skipped_duplicates,
        "imported_triples": imported_triples,
        "warnings": warnings,
    }