mempalace-code 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mempalace/README.md +40 -0
- mempalace/__init__.py +6 -0
- mempalace/__main__.py +5 -0
- mempalace/cli.py +811 -0
- mempalace/config.py +149 -0
- mempalace/convo_miner.py +415 -0
- mempalace/dialect.py +1075 -0
- mempalace/entity_detector.py +853 -0
- mempalace/entity_registry.py +639 -0
- mempalace/export.py +378 -0
- mempalace/general_extractor.py +521 -0
- mempalace/knowledge_graph.py +410 -0
- mempalace/layers.py +515 -0
- mempalace/mcp_server.py +873 -0
- mempalace/migrate.py +153 -0
- mempalace/miner.py +1285 -0
- mempalace/normalize.py +328 -0
- mempalace/onboarding.py +489 -0
- mempalace/palace_graph.py +225 -0
- mempalace/py.typed +0 -0
- mempalace/room_detector_local.py +310 -0
- mempalace/searcher.py +305 -0
- mempalace/spellcheck.py +269 -0
- mempalace/split_mega_files.py +309 -0
- mempalace/storage.py +807 -0
- mempalace/version.py +3 -0
- mempalace_code-1.0.0.dist-info/METADATA +489 -0
- mempalace_code-1.0.0.dist-info/RECORD +32 -0
- mempalace_code-1.0.0.dist-info/WHEEL +4 -0
- mempalace_code-1.0.0.dist-info/entry_points.txt +2 -0
- mempalace_code-1.0.0.dist-info/licenses/LICENSE +192 -0
- mempalace_code-1.0.0.dist-info/licenses/NOTICE +17 -0
mempalace/export.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""
|
|
2
|
+
export.py — Export and import drawers + KG triples as JSONL
|
|
3
|
+
|
|
4
|
+
Provides backup/restore for manually-added drawers, diary entries, and knowledge
|
|
5
|
+
graph triples that would otherwise be lost when nuking and re-seeding a palace.
|
|
6
|
+
|
|
7
|
+
Typical workflow:
|
|
8
|
+
# Before nuke-and-re-seed:
|
|
9
|
+
mempalace export --only-manual --with-kg --out backup.jsonl
|
|
10
|
+
|
|
11
|
+
# After re-mine:
|
|
12
|
+
mempalace import backup.jsonl
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from typing import Any, Dict, Iterator, List, Optional
|
|
21
|
+
|
|
22
|
+
from .version import __version__
|
|
23
|
+
|
|
24
|
+
# Chunker strategies produced by manual writes (MCP add_drawer + diary)
|
|
25
|
+
_MANUAL_STRATEGIES = ("manual_v1", "diary_v1")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ── Header ────────────────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _make_header(
    palace_path: str,
    filters: Dict[str, Any],
    drawer_count: int,
    kg_count: int,
) -> Dict[str, Any]:
    """Build the ``export_header`` record written as the first JSONL line.

    The header lets an importer sanity-check version, provenance, active
    filters, and expected record counts before processing the stream.

    Args:
        palace_path: Path of the palace being exported (informational).
        filters: The export filter options that were in effect.
        drawer_count: Number of drawer records that follow.
        kg_count: Number of KG triple records that follow.

    Returns:
        Dict ready to be serialized as the first JSONL line.
    """
    return {
        "type": "export_header",
        "version": __version__,
        "palace_path": palace_path,
        # Timezone-aware local timestamp: a naive now() is ambiguous once the
        # backup file travels to a machine in a different timezone.
        "exported_at": datetime.now().astimezone().isoformat(),
        "filters": filters,
        "drawer_count": drawer_count,
        "kg_count": kg_count,
    }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── Export ────────────────────────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _build_drawer_where(
|
|
52
|
+
only_manual: bool = False,
|
|
53
|
+
wing: Optional[str] = None,
|
|
54
|
+
room: Optional[str] = None,
|
|
55
|
+
) -> Optional[Dict[str, Any]]:
|
|
56
|
+
"""Build a DrawerStore where-filter dict from export options."""
|
|
57
|
+
clauses = []
|
|
58
|
+
if only_manual:
|
|
59
|
+
clauses.append({"$or": [{"chunker_strategy": s} for s in _MANUAL_STRATEGIES]})
|
|
60
|
+
if wing:
|
|
61
|
+
clauses.append({"wing": wing})
|
|
62
|
+
if room:
|
|
63
|
+
clauses.append({"room": room})
|
|
64
|
+
|
|
65
|
+
if not clauses:
|
|
66
|
+
return None
|
|
67
|
+
if len(clauses) == 1:
|
|
68
|
+
return clauses[0]
|
|
69
|
+
return {"$and": clauses}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def export_drawers(
    store,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
    include_vectors: bool = False,
) -> Iterator[Dict[str, Any]]:
    """Yield one exportable drawer dict per stored record.

    Records are streamed from ``store.iter_all`` in batches; ``since`` is
    applied as a post-filter on the ``filed_at`` metadata field.
    """
    meta_keys = (
        "wing",
        "room",
        "source_file",
        "chunk_index",
        "added_by",
        "filed_at",
        "hall",
        "topic",
        "agent",
        "date",
        "ingest_mode",
        "extract_mode",
        "compression_ratio",
        "original_tokens",
        "language",
        "symbol_name",
        "symbol_type",
        "source_hash",
        "extractor_version",
        "chunker_strategy",
    )
    where = _build_drawer_where(only_manual=only_manual, wing=wing, room=room)
    for batch in store.iter_all(where=where, include_vectors=include_vectors):
        for row in batch:
            # ISO date strings compare lexicographically, so `<` is a valid
            # chronological comparison here.
            if since and row.get("filed_at", "") < since:
                continue
            vec = row.get("vector") if include_vectors else None
            record: Dict[str, Any] = {
                "type": "drawer",
                "id": row.get("id", ""),
                "text": row.get("text", ""),
                "embedding": list(vec) if vec is not None else None,
            }
            record.update({key: row.get(key, "") for key in meta_keys})
            # The drawer metadata `type` field would collide with the record
            # marker, so it travels under `drawer_type` instead.
            record["drawer_type"] = row.get("type", "")
            yield record
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def export_kg(
    kg,
    since: Optional[str] = None,
) -> Iterator[Dict[str, Any]]:
    """Yield one exportable KG triple dict per stored triple.

    Triples with a ``valid_from`` older than *since* are skipped; undated
    triples are always exported.
    """
    for batch in kg.iter_all_triples():
        for triple in batch:
            started = triple.get("valid_from")
            if since and started and started < since:
                continue
            # Tag the record; any `type` key inside the triple wins, matching
            # plain dict.update semantics.
            yield {"type": "kg_triple", **triple}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _count_drawers(
    store,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
) -> int:
    """Count the drawers a matching export pass would emit (for the header)."""
    matching = export_drawers(
        store, only_manual=only_manual, wing=wing, room=room, since=since
    )
    return sum(1 for _ in matching)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _count_kg(kg, since: Optional[str] = None) -> int:
    """Count the KG triples a matching export pass would emit (for the header)."""
    total = 0
    for _ in export_kg(kg, since=since):
        total += 1
    return total
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def write_jsonl(
    path: str,
    store,
    kg=None,
    only_manual: bool = False,
    wing: Optional[str] = None,
    room: Optional[str] = None,
    since: Optional[str] = None,
    include_vectors: bool = False,
    include_kg: bool = False,
    pretty: bool = False,
    palace_path: str = "",
) -> Dict[str, int]:
    """Write export JSONL to *path* (use '-' for stdout).

    Makes two passes over the data: one to count records for the header, one
    to stream them out. NOTE(review): ``pretty=True`` indents each record
    across multiple lines, which the line-oriented reader cannot re-import —
    use it only for human inspection.

    Returns:
        Summary dict: ``{drawer_count, kg_count}``.
    """
    indent = 2 if pretty else None

    # Record the active filters in the header. Insertion order matters for
    # byte-stable output; boolean options are stored as the literal True.
    filters: Dict[str, Any] = {}
    for name, value, is_flag in (
        ("only_manual", only_manual, True),
        ("wing", wing, False),
        ("room", room, False),
        ("since", since, False),
        ("with_embeddings", include_vectors, True),
        ("with_kg", include_kg, True),
    ):
        if value:
            filters[name] = True if is_flag else value

    # Pass 1: counts for the header (two passes are fine at our data sizes).
    drawer_count = _count_drawers(
        store, only_manual=only_manual, wing=wing, room=room, since=since
    )
    kg_wanted = include_kg and kg is not None
    kg_count = _count_kg(kg, since=since) if kg_wanted else 0

    header = _make_header(
        palace_path=palace_path,
        filters=filters,
        drawer_count=drawer_count,
        kg_count=kg_count,
    )

    fh = sys.stdout if path == "-" else open(path, "w", encoding="utf-8")
    try:
        def emit(obj: Dict[str, Any]) -> None:
            fh.write(json.dumps(obj, indent=indent) + "\n")

        # Pass 2: header first, then drawers, then (optionally) KG triples.
        emit(header)
        for record in export_drawers(
            store,
            only_manual=only_manual,
            wing=wing,
            room=room,
            since=since,
            include_vectors=include_vectors,
        ):
            emit(record)
        if kg_wanted:
            for record in export_kg(kg, since=since):
                emit(record)
    finally:
        # Never close stdout — it belongs to the caller.
        if fh is not sys.stdout:
            fh.close()

    return {"drawer_count": drawer_count, "kg_count": kg_count}
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# ── Import ────────────────────────────────────────────────────────────────────
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def read_jsonl(path: str) -> Iterator[Dict[str, Any]]:
    """Yield parsed JSON objects from a JSONL file ('-' reads stdin).

    Blank lines are skipped; every other line must be a complete JSON value.
    """
    if path == "-":
        fh = sys.stdin
    else:
        fh = open(path, encoding="utf-8")
    try:
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                continue
            yield json.loads(stripped)
    finally:
        # stdin belongs to the caller; only close files we opened.
        if fh is not sys.stdin:
            fh.close()
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def import_jsonl(
    path: str,
    store,
    kg=None,
    skip_dedup: bool = False,
    skip_kg: bool = False,
    dry_run: bool = False,
    wing_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Import drawers and KG triples from a JSONL export file.

    Streams records from *path* ('-' reads stdin via ``read_jsonl``) and
    dispatches on each record's ``type``:

    - ``export_header``: version-checked; a mismatch is warned about but
      import proceeds.
    - ``drawer``: optionally dedup-checked against the store, then added
      (with an upsert fallback on failure).
    - ``kg_triple``: forwarded to ``kg.add_triple`` unless ``skip_kg`` or
      no ``kg`` was supplied.

    Args:
        path: JSONL file to read, or '-' for stdin.
        store: Drawer store exposing ``query``/``add``/``upsert``.
        kg: Optional knowledge graph exposing ``add_triple``.
        skip_dedup: Skip the similarity-based duplicate check.
        skip_kg: Ignore kg_triple records even if *kg* is provided.
        dry_run: Count what would be imported without writing anything.
        wing_override: Force all imported drawers into this wing.

    Returns:
        Summary: {imported_drawers, skipped_duplicates, imported_triples, warnings}.
    """
    imported_drawers = 0
    skipped_duplicates = 0
    imported_triples = 0
    warnings: List[str] = []

    header_seen = False
    # Ensures the missing-header warning is appended at most once.
    no_header_warned = False

    for record in read_jsonl(path):
        rtype = record.get("type")

        if rtype == "export_header":
            header_seen = True
            file_version = record.get("version", "")
            # A version mismatch is non-fatal: warn (both in the summary and
            # on stderr) and keep importing.
            if file_version and file_version != __version__:
                msg = (
                    f"Version mismatch: export was created with {file_version}, "
                    f"current is {__version__}. Proceeding anyway."
                )
                warnings.append(msg)
                print(f"WARNING: {msg}", file=sys.stderr)
            continue

        # Any data record arriving before a header suggests a malformed file.
        if not header_seen and not no_header_warned:
            warnings.append("No export_header found at start of file — format may be invalid.")
            no_header_warned = True

        if rtype == "drawer":
            wing = wing_override or record.get("wing", "")
            room = record.get("room", "")
            text = record.get("text", "")
            drawer_id = record.get("id", "")

            # Drawers without text carry no content worth importing.
            if not text:
                continue

            # Dedup check via cosine similarity
            if not skip_dedup:
                try:
                    # Nearest-neighbor probe: if an existing drawer is very
                    # similar to this text, treat it as a duplicate.
                    results = store.query(
                        query_texts=[text],
                        n_results=1,
                        include=["distances"],
                    )
                    dists = results.get("distances", [[]])[0]
                    if dists:
                        # LanceDB returns L2 distance; convert to approximate cosine similarity
                        # For unit vectors: cosine_sim ≈ 1 - (L2^2 / 2)
                        # At threshold 0.9 cosine → L2^2 ≈ 0.2 → L2 ≈ 0.447
                        l2 = dists[0]
                        cosine_sim = max(0.0, 1.0 - (l2 * l2) / 2.0)
                        if cosine_sim >= 0.9:
                            skipped_duplicates += 1
                            continue
                except Exception:
                    pass  # If dedup check fails, proceed with import

            # Dry run still counts the record so the summary reflects what a
            # real import would do.
            if dry_run:
                imported_drawers += 1
                continue

            # Build metadata
            meta_keys = (
                "source_file",
                "chunk_index",
                "added_by",
                "filed_at",
                "hall",
                "topic",
                "agent",
                "date",
                "ingest_mode",
                "extract_mode",
                "compression_ratio",
                "original_tokens",
                "language",
                "symbol_name",
                "symbol_type",
                "source_hash",
                "extractor_version",
                "chunker_strategy",
            )
            meta: Dict[str, Any] = {"wing": wing, "room": room}
            # Only copy keys actually present so absent metadata stays absent.
            for k in meta_keys:
                if k in record:
                    meta[k] = record[k]
            # `drawer_type` was stored to avoid collision with the record `type` key
            if "drawer_type" in record:
                meta["type"] = record["drawer_type"]

            try:
                store.add(ids=[drawer_id], documents=[text], metadatas=[meta])
                imported_drawers += 1
            except Exception as exc:
                # Duplicate ID — try upsert
                try:
                    store.upsert(ids=[drawer_id], documents=[text], metadatas=[meta])
                    imported_drawers += 1
                except Exception:
                    # Report the original add() failure, which is usually the
                    # more informative error.
                    warnings.append(f"Failed to import drawer {drawer_id}: {exc}")

        elif rtype == "kg_triple" and not skip_kg and kg is not None:
            if dry_run:
                imported_triples += 1
                continue
            try:
                kg.add_triple(
                    subject=record.get("subject", ""),
                    predicate=record.get("predicate", ""),
                    obj=record.get("object", ""),
                    valid_from=record.get("valid_from"),
                    valid_to=record.get("valid_to"),
                    confidence=record.get("confidence", 1.0),
                    source_closet=record.get("source_closet"),
                    source_file=record.get("source_file"),
                )
                imported_triples += 1
            except Exception as exc:
                # A bad triple should not abort the rest of the import.
                warnings.append(f"Failed to import KG triple: {exc}")

    return {
        "imported_drawers": imported_drawers,
        "skipped_duplicates": skipped_duplicates,
        "imported_triples": imported_triples,
        "warnings": warnings,
    }
|