mempalace-code 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mempalace/README.md +40 -0
- mempalace/__init__.py +6 -0
- mempalace/__main__.py +5 -0
- mempalace/cli.py +811 -0
- mempalace/config.py +149 -0
- mempalace/convo_miner.py +415 -0
- mempalace/dialect.py +1075 -0
- mempalace/entity_detector.py +853 -0
- mempalace/entity_registry.py +639 -0
- mempalace/export.py +378 -0
- mempalace/general_extractor.py +521 -0
- mempalace/knowledge_graph.py +410 -0
- mempalace/layers.py +515 -0
- mempalace/mcp_server.py +873 -0
- mempalace/migrate.py +153 -0
- mempalace/miner.py +1285 -0
- mempalace/normalize.py +328 -0
- mempalace/onboarding.py +489 -0
- mempalace/palace_graph.py +225 -0
- mempalace/py.typed +0 -0
- mempalace/room_detector_local.py +310 -0
- mempalace/searcher.py +305 -0
- mempalace/spellcheck.py +269 -0
- mempalace/split_mega_files.py +309 -0
- mempalace/storage.py +807 -0
- mempalace/version.py +3 -0
- mempalace_code-1.0.0.dist-info/METADATA +489 -0
- mempalace_code-1.0.0.dist-info/RECORD +32 -0
- mempalace_code-1.0.0.dist-info/WHEEL +4 -0
- mempalace_code-1.0.0.dist-info/entry_points.txt +2 -0
- mempalace_code-1.0.0.dist-info/licenses/LICENSE +192 -0
- mempalace_code-1.0.0.dist-info/licenses/NOTICE +17 -0
|
@@ -0,0 +1,639 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
entity_registry.py — Persistent personal entity registry for MemPalace.
|
|
4
|
+
|
|
5
|
+
Knows the difference between Riley (a person) and ever (an adverb).
|
|
6
|
+
Built from three sources, in priority order:
|
|
7
|
+
1. Onboarding — what the user explicitly told us
|
|
8
|
+
2. Learned — what we inferred from session history with high confidence
|
|
9
|
+
3. Researched — what we looked up via Wikipedia for unknown words
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
from mempalace.entity_registry import EntityRegistry
|
|
13
|
+
registry = EntityRegistry.load()
|
|
14
|
+
result = registry.lookup("Riley", context="I went with Riley today")
|
|
15
|
+
# → {"type": "person", "confidence": 1.0, "source": "onboarding"}
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
import urllib.request
|
|
21
|
+
import urllib.parse
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
27
|
+
# Common English words that could be confused with names
|
|
28
|
+
# These get flagged as AMBIGUOUS and require context disambiguation
|
|
29
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
COMMON_ENGLISH_WORDS = {
    # Words that are also common personal names
    "ever", "grace", "will", "bill", "mark", "april", "may", "june", "joy",
    "hope", "faith", "chance", "chase", "hunter", "dash", "flash", "star",
    "sky", "river", "brook", "lane", "art", "clay", "gil", "nat", "max",
    "rex", "ray", "jay", "rose", "violet", "lily", "ivy", "ash", "reed",
    "sage",
    # Words that look like names at start of sentence (weekdays, then months;
    # april/may/june already appear in the group above)
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    "sunday",
    "january", "february", "march", "july", "august", "september", "october",
    "november", "december",
}
|
|
87
|
+
|
|
88
|
+
# Context patterns that indicate a word is being used as a PERSON name
|
|
89
|
+
# Each entry is a regex template; "{name}" is filled in with str.format()
# using the regex-escaped, lower-cased candidate word before matching against
# the lower-cased context sentence.
PERSON_CONTEXT_PATTERNS = [
    r"\b{name}\s+said\b",
    r"\b{name}\s+told\b",
    r"\b{name}\s+asked\b",
    r"\b{name}\s+laughed\b",
    r"\b{name}\s+smiled\b",
    r"\b{name}\s+was\b",
    r"\b{name}\s+is\b",
    r"\b{name}\s+called\b",
    r"\b{name}\s+texted\b",
    r"\bwith\s+{name}\b",
    r"\bsaw\s+{name}\b",
    r"\bcalled\s+{name}\b",
    r"\btook\s+{name}\b",
    r"\bpicked\s+up\s+{name}\b",
    r"\bdrop(?:ped)?\s+(?:off\s+)?{name}\b",
    # Possessives: Riley's, Max's, the Rileys'.
    # A trailing \b after the apostrophe in "s'" would require a word
    # character to follow, so "the rileys' house" never matched; use a
    # negative lookahead for the plural form instead.
    r"\b{name}(?:'s\b|s'(?!\w))",
    r"\bhey\s+{name}\b",
    r"\bthanks?\s+{name}\b",
    r"^{name}[:\s]",  # dialogue: "Riley: ..."
    r"\bmy\s+(?:son|daughter|kid|child|brother|sister|friend|partner|colleague|coworker)\s+{name}\b",
]
|
|
111
|
+
|
|
112
|
+
# Context patterns that indicate a word is NOT being used as a name
|
|
113
|
+
# Each entry is a regex template; "{name}" is filled in with str.format()
# using the regex-escaped, lower-cased candidate word before matching against
# the lower-cased context sentence.
CONCEPT_CONTEXT_PATTERNS = [
    r"\bhave\s+you\s+{name}\b",  # "have you ever"
    r"\bif\s+you\s+{name}\b",  # "if you ever"
    r"\b{name}\s+since\b",  # "ever since"
    r"\b{name}\s+again\b",  # "ever again"
    r"\bnot\s+{name}\b",  # "not ever"
    r"\b{name}\s+more\b",  # "ever more"
    r"\bwould\s+{name}\b",  # "would ever"
    r"\bcould\s+{name}\b",  # "could ever"
    r"\bwill\s+{name}\b",  # "will ever"
    # "the grace of", "the mark of".
    # Anchored with a leading \b so the name cannot match as the tail of a
    # longer word ("forever in" must not count as a hit for "ever").
    r"\b(?:the\s+)?{name}\s+(?:of|in|at|for|to)\b",
]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
128
|
+
# Wikipedia lookup for unknown words
|
|
129
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
# Phrases in Wikipedia summaries that indicate a personal name
|
|
132
|
+
NAME_INDICATOR_PHRASES = [
    # Generic "this is a name" wording
    "given name", "personal name", "first name", "forename",
    # Gendered wording
    "masculine name", "feminine name", "boy's name", "girl's name",
    "male name", "female name",
    # Origin-qualified wording
    "irish name", "welsh name", "scottish name", "gaelic name",
    "hebrew name", "arabic name", "norse name", "old english name",
    # Looser sentence fragments
    "is a name", "as a name", "name meaning", "name derived from",
    # Legendary figures
    "legendary irish", "legendary welsh", "legendary scottish",
]
|
|
159
|
+
|
|
160
|
+
# Lower-case substrings looked for in a Wikipedia summary to infer a place
PLACE_INDICATOR_PHRASES = [
    "city in", "town in", "village in", "municipality",
    "capital of", "district of", "county", "province",
    "region of", "island of", "mountain in", "river in",
]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _classify_wiki_summary(word: str, data: dict) -> dict:
    """Infer an entity type from a decoded Wikipedia summary payload (no I/O)."""
    page_type = data.get("type", "")
    # "extract"/"description" may be present but null in the JSON payload
    extract = (data.get("extract") or "").lower()
    common = {"wiki_summary": extract[:200], "wiki_title": data.get("title") or word}

    # Disambiguation — look at description
    if page_type == "disambiguation":
        desc = (data.get("description") or "").lower()
        if "name" in desc:
            return {
                "inferred_type": "person",
                "confidence": 0.65,
                **common,
                "note": "disambiguation page with name entries",
            }
        return {"inferred_type": "ambiguous", "confidence": 0.4, **common}

    # Check for name indicators
    if any(phrase in extract for phrase in NAME_INDICATOR_PHRASES):
        # Higher confidence if the word itself is described as a name
        needle = word.lower()
        described_directly = f"{needle} is a" in extract or f"{needle} (name" in extract
        return {
            "inferred_type": "person",
            "confidence": 0.90 if described_directly else 0.80,
            **common,
        }

    # Check for place indicators
    if any(phrase in extract for phrase in PLACE_INDICATOR_PHRASES):
        return {"inferred_type": "place", "confidence": 0.80, **common}

    # Found but doesn't match name/place patterns
    return {"inferred_type": "concept", "confidence": 0.60, **common}


def _wikipedia_lookup(word: str) -> dict:
    """
    Look up a word via the Wikipedia REST summary API.

    Returns a dict with "inferred_type" (person/place/concept/ambiguous/unknown),
    "confidence", "wiki_summary" and "wiki_title"; some results also carry a
    "note". Free, no API key, handles disambiguation pages.

    Network, HTTP and decoding failures do not raise: they produce an
    "unknown" result with confidence 0.0. The one exception is HTTP 404, which
    is read as "probably an unusual name" (person, 0.70). An empty or
    whitespace-only word returns "unknown" without any request being made.
    """
    # Function-scope import: the except clauses below name urllib.error, which
    # the module's import block does not import explicitly.
    import urllib.error

    unknown = {
        "inferred_type": "unknown",
        "confidence": 0.0,
        "wiki_summary": None,
        "wiki_title": None,
    }

    # Without this guard an empty word would query the API root, and a 404
    # from there would be misreported as a likely person name.
    if not isinstance(word, str) or not word.strip():
        return unknown
    word = word.strip()

    try:
        # safe="" so a "/" inside the word cannot change the request path
        url = "https://en.wikipedia.org/api/rest_v1/page/summary/" + urllib.parse.quote(
            word, safe=""
        )
        req = urllib.request.Request(url, headers={"User-Agent": "MemPalace/1.0"})
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
            return {
                "inferred_type": "person",
                "confidence": 0.70,
                "wiki_summary": None,
                "wiki_title": None,
                "note": "not found in Wikipedia — likely a proper noun or unusual name",
            }
        return unknown
    except (urllib.error.URLError, OSError, ValueError, KeyError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        return unknown

    if not isinstance(data, dict):
        return unknown
    return _classify_wiki_summary(word, data)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
261
|
+
# Entity Registry
|
|
262
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class EntityRegistry:
    """
    Persistent personal entity registry.

    Stored at ~/.mempalace/entity_registry.json
    Schema:
    {
      "mode": "personal",   # work | personal | combo
      "version": 1,
      "people": {
        "Riley": {
          "source": "onboarding",
          "contexts": ["personal"],
          "aliases": [],
          "relationship": "daughter",
          "confidence": 1.0
        }
      },
      "projects": ["MemPalace", "Acme"],
      "ambiguous_flags": ["riley", "max"],
      "wiki_cache": {
        "Sam": {"inferred_type": "person", "confidence": 0.9, "confirmed": true, ...}
      }
    }

    Alias entries written by seed() additionally carry a "canonical" key naming
    the entry they stand for; confirmed wiki_cache entries may carry a
    "confirmed_type" set by confirm_research().
    """

    DEFAULT_PATH = Path.home() / ".mempalace" / "entity_registry.json"

    def __init__(self, data: dict, path: Path):
        """
        Wrap an already-decoded registry dict.

        data: registry contents; missing top-level sections are filled in place
              with the defaults from _empty()
        path: file that save() writes to
        """
        # Mutating methods index self._data["people"] etc. directly, so make
        # sure every section exists even when the supplied dict omits it.
        for key, default in self._empty().items():
            data.setdefault(key, default)
        self._data = data
        self._path = path

    # ── Load / Save ──────────────────────────────────────────────────────────

    @classmethod
    def load(cls, config_dir: Optional[Path] = None) -> "EntityRegistry":
        """
        Load the registry from config_dir (or DEFAULT_PATH when not given).

        Best-effort: a missing, unreadable, undecodable or non-object JSON file
        yields an empty registry instead of raising.
        """
        path = (Path(config_dir) / "entity_registry.json") if config_dir else cls.DEFAULT_PATH
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing/unreadable file.
            # ValueError: json.JSONDecodeError or UnicodeDecodeError.
            data = None
        if not isinstance(data, dict):
            data = cls._empty()
        return cls(data, path)

    def save(self):
        """Write the registry as indented JSON, creating parent directories."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self._data, indent=2)
        # Write to a sibling file and rename over the target, so an interrupted
        # write cannot leave a truncated registry that load() would then
        # silently treat as empty.
        tmp_path = self._path.with_name(self._path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(self._path)

    @staticmethod
    def _empty() -> dict:
        """Return a fresh registry dict with every section present and empty."""
        return {
            "version": 1,
            "mode": "personal",
            "people": {},
            "projects": [],
            "ambiguous_flags": [],
            "wiki_cache": {},
        }

    # ── Properties ───────────────────────────────────────────────────────────

    @property
    def mode(self) -> str:
        return self._data.get("mode", "personal")

    @property
    def people(self) -> dict:
        return self._data.get("people", {})

    @property
    def projects(self) -> list:
        return self._data.get("projects", [])

    @property
    def ambiguous_flags(self) -> list:
        return self._data.get("ambiguous_flags", [])

    # ── Seed from onboarding ─────────────────────────────────────────────────

    def seed(self, mode: str, people: list, projects: list, aliases: Optional[dict] = None):
        """
        Seed the registry from onboarding data, then save.

        people: list of dicts {"name": str, "relationship": str, "context": str}
        projects: list of str
        aliases: dict {"Max": "Maxwell", ...}  (alias -> canonical name)

        Entries without a usable name are skipped. "ambiguous_flags" is rebuilt
        from every person currently in the registry.
        """
        self._data["mode"] = mode
        self._data["projects"] = list(projects)
        registry = self._data.setdefault("people", {})

        # Invert alias -> canonical into canonical -> [aliases]. A plain dict
        # inversion would keep only one alias per canonical name.
        aliases_by_canonical = {}
        for alias, canonical in (aliases or {}).items():
            aliases_by_canonical.setdefault(canonical, []).append(alias)

        for entry in people:
            name = (entry.get("name") or "").strip()
            if not name:
                continue
            context = entry.get("context", "personal")
            relationship = entry.get("relationship", "")
            known_aliases = aliases_by_canonical.get(name, [])

            registry[name] = {
                "source": "onboarding",
                "contexts": [context],
                "aliases": list(known_aliases),
                "relationship": relationship,
                "confidence": 1.0,
            }

            # Also register aliases, each pointing back at the canonical entry
            for alias in known_aliases:
                registry[alias] = {
                    "source": "onboarding",
                    "contexts": [context],
                    "aliases": [name],
                    "relationship": relationship,
                    "confidence": 1.0,
                    "canonical": name,
                }

        # Flag ambiguous names (also common English words); dict.fromkeys
        # removes duplicates from case variants while keeping order.
        self._data["ambiguous_flags"] = list(
            dict.fromkeys(
                name.lower() for name in registry if name.lower() in COMMON_ENGLISH_WORDS
            )
        )

        self.save()

    # ── Lookup ───────────────────────────────────────────────────────────────

    def lookup(self, word: str, context: str = "") -> dict:
        """
        Look up a word. Returns entity classification.

        context: surrounding sentence (used for disambiguation of ambiguous words)

        Checks, in order: people (by name or alias, case-insensitive), projects,
        then confirmed wiki_cache entries.

        Returns:
            {"type": "person"|"project"|"concept"|"unknown",
             "confidence": float,
             "source": "onboarding"|"learned"|"wiki"|"inferred",
             "name": canonical name if found,
             "needs_disambiguation": bool}
        """
        needle = word.lower()

        # 1. Exact match in people registry
        for key, info in self.people.items():
            alias_names = [a.lower() for a in info.get("aliases", [])]
            if needle != key.lower() and needle not in alias_names:
                continue

            # Alias entries name the entry they stand for
            canonical = info.get("canonical", key)

            # Check if this is an ambiguous word
            if context and needle in self.ambiguous_flags:
                resolved = self._disambiguate(word, context, info)
                if resolved is not None:
                    if resolved["type"] == "person":
                        # Report the registry's name, as the non-ambiguous path does
                        resolved["name"] = canonical
                    return resolved
            return {
                "type": "person",
                "confidence": info["confidence"],
                "source": info["source"],
                "name": canonical,
                "context": info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
            }

        # 2. Project match
        for proj in self.projects:
            if needle == proj.lower():
                return {
                    "type": "project",
                    "confidence": 1.0,
                    "source": "onboarding",
                    "name": proj,
                    "needs_disambiguation": False,
                }

        # 3. Wiki cache (only entries a user or auto_confirm has confirmed)
        for cached_word, cached in self._data.get("wiki_cache", {}).items():
            if needle == cached_word.lower() and cached.get("confirmed"):
                return {
                    # confirm_research() records the user's choice separately
                    # from the inferred type; the user's choice wins.
                    "type": cached.get("confirmed_type") or cached["inferred_type"],
                    "confidence": cached["confidence"],
                    "source": "wiki",
                    "name": word,
                    "needs_disambiguation": False,
                }

        return {
            "type": "unknown",
            "confidence": 0.0,
            "source": "none",
            "name": word,
            "needs_disambiguation": False,
        }

    def _disambiguate(self, word: str, context: str, person_info: dict) -> Optional[dict]:
        """
        When a word is both a name and a common word, check context.

        Counts how many PERSON_CONTEXT_PATTERNS and CONCEPT_CONTEXT_PATTERNS
        match the lower-cased context. Returns a person result when person
        patterns win, a concept result when concept patterns win, and None on
        a tie (including no matches at all).
        """
        escaped = re.escape(word.lower())
        ctx_lower = context.lower()

        def count_hits(patterns):
            return sum(
                1 for pat in patterns if re.search(pat.format(name=escaped), ctx_lower)
            )

        person_score = count_hits(PERSON_CONTEXT_PATTERNS)
        concept_score = count_hits(CONCEPT_CONTEXT_PATTERNS)

        if person_score > concept_score:
            return {
                "type": "person",
                "confidence": min(0.95, 0.7 + person_score * 0.1),
                "source": person_info["source"],
                "name": word,
                "context": person_info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }
        if concept_score > person_score:
            return {
                "type": "concept",
                "confidence": min(0.90, 0.7 + concept_score * 0.1),
                "source": "context_disambiguated",
                "name": word,
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }

        # Truly ambiguous — return None to fall through to person (registered name)
        return None

    # ── Research unknown words ───────────────────────────────────────────────

    def research(self, word: str, auto_confirm: bool = False) -> dict:
        """
        Research an unknown word via Wikipedia.
        Caches result. If auto_confirm=False, marks as unconfirmed (needs user review).
        Returns the lookup result.

        The cache is keyed by the exact spelling passed in; a cached entry is
        returned as-is without a new request or save.
        """
        cache = self._data.setdefault("wiki_cache", {})
        if word in cache:
            return cache[word]

        result = _wikipedia_lookup(word)
        result["word"] = word
        result["confirmed"] = auto_confirm

        cache[word] = result
        self.save()
        return result

    def confirm_research(
        self, word: str, entity_type: str, relationship: str = "", context: str = "personal"
    ):
        """
        Mark a researched word as confirmed and, for people, add it to the registry.

        entity_type is stored as the cache entry's "confirmed_type". When it is
        "person" the word is written to the people section (source "wiki",
        confidence 0.90) whether or not a cache entry exists. Always saves.
        """
        cache = self._data.get("wiki_cache", {})
        if word in cache:
            cache[word]["confirmed"] = True
            cache[word]["confirmed_type"] = entity_type

        if entity_type == "person":
            self._data.setdefault("people", {})[word] = {
                "source": "wiki",
                "contexts": [context],
                "aliases": [],
                "relationship": relationship,
                "confidence": 0.90,
            }
            lowered = word.lower()
            if lowered in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if lowered not in flags:
                    flags.append(lowered)

        self.save()

    # ── Learn from sessions ──────────────────────────────────────────────────

    def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list:
        """
        Scan session text for new entity candidates.

        Candidates classified as "person" with confidence >= min_confidence are
        added to the people section with source "learned". Saves only when at
        least one was added.

        Returns list of newly discovered candidates for review.
        """
        from mempalace.entity_detector import extract_candidates, score_entity, classify_entity

        lines = text.splitlines()
        candidates = extract_candidates(text)
        registry = self._data.setdefault("people", {})
        # "combo" registries file learned people under "personal"
        learned_context = self.mode if self.mode != "combo" else "personal"
        new_candidates = []

        for name, frequency in candidates.items():
            # Skip if already known
            if name in registry or name in self.projects:
                continue

            scores = score_entity(name, text, lines)
            entity = classify_entity(name, frequency, scores)

            if entity["type"] != "person" or entity["confidence"] < min_confidence:
                continue

            registry[name] = {
                "source": "learned",
                "contexts": [learned_context],
                "aliases": [],
                "relationship": "",
                "confidence": entity["confidence"],
                "seen_count": frequency,
            }
            lowered = name.lower()
            if lowered in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if lowered not in flags:
                    flags.append(lowered)
            new_candidates.append(entity)

        if new_candidates:
            self.save()

        return new_candidates

    # ── Query helpers for retrieval ──────────────────────────────────────────

    def extract_people_from_query(self, query: str) -> list:
        """
        Extract known person names from a query string.

        Names and aliases are matched on word boundaries, case-insensitively.
        A name listed in ambiguous_flags only counts when _disambiguate()
        resolves it to a person; a tie there means it is left out.

        Returns list of canonical names found, each at most once.
        """
        found = []

        for key, info in self.people.items():
            # Alias entries resolve to the entry they stand for, so a query
            # mentioning "Max" yields ["Maxwell"], not ["Maxwell", "Max"].
            canonical = info.get("canonical", key)
            if canonical in found:
                continue

            for name in [key, *info.get("aliases", [])]:
                # Word boundary match
                if not re.search(rf"\b{re.escape(name)}\b", query, re.IGNORECASE):
                    continue
                # For ambiguous words, check context
                if name.lower() in self.ambiguous_flags:
                    result = self._disambiguate(name, query, info)
                    if not (result and result["type"] == "person"):
                        continue
                found.append(canonical)
                break
        return found

    def extract_unknown_candidates(self, query: str) -> list:
        """
        Find capitalized words in query that aren't in registry or common words.
        These are candidates for Wikipedia research.

        Returned in order of first appearance, without duplicates.
        """
        candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
        unknown = []
        # dict.fromkeys de-duplicates while keeping a deterministic order
        for word in dict.fromkeys(candidates):
            if word.lower() in COMMON_ENGLISH_WORDS:
                continue
            if self.lookup(word)["type"] == "unknown":
                unknown.append(word)
        return unknown

    # ── Summary ──────────────────────────────────────────────────────────────

    def summary(self) -> str:
        """Return a short multi-line, human-readable overview of the registry."""
        names = list(self.people)
        shown = ", ".join(names[:8])
        more = "..." if len(names) > 8 else ""
        lines = [
            f"Mode: {self.mode}",
            f"People: {len(names)} ({shown}{more})",
            f"Projects: {', '.join(self.projects) or '(none)'}",
            f"Ambiguous flags: {', '.join(self.ambiguous_flags) or '(none)'}",
            f"Wiki cache: {len(self._data.get('wiki_cache', {}))} entries",
        ]
        return "\n".join(lines)
|