mempalace-code 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,639 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ entity_registry.py — Persistent personal entity registry for MemPalace.
4
+
5
+ Knows the difference between Riley (a person) and ever (an adverb).
6
+ Built from three sources, in priority order:
7
+ 1. Onboarding — what the user explicitly told us
8
+ 2. Learned — what we inferred from session history with high confidence
9
+ 3. Researched — what we looked up via Wikipedia for unknown words
10
+
11
+ Usage:
12
+ from mempalace.entity_registry import EntityRegistry
13
+ registry = EntityRegistry.load()
14
+ result = registry.lookup("Riley", context="I went with Riley today")
15
+ # → {"type": "person", "confidence": 1.0, "source": "onboarding"}
16
+ """
17
+
18
+ import json
19
+ import re
20
+ import urllib.request
21
+ import urllib.parse
22
+ from pathlib import Path
23
+ from typing import Optional
24
+
25
+
26
+ # ─────────────────────────────────────────────────────────────────────────────
27
+ # Common English words that could be confused with names
28
+ # These get flagged as AMBIGUOUS and require context disambiguation
29
+ # ─────────────────────────────────────────────────────────────────────────────
30
+
31
# Lowercase everyday words that can also be read as a personal name.
# Built by splitting whitespace-separated runs; the result is a plain set.
COMMON_ENGLISH_WORDS = set(
    # Words that are also common personal names
    "ever grace will bill mark april may june joy hope faith chance chase "
    "hunter dash flash star sky river brook lane art clay gil nat max rex "
    "ray jay rose violet lily ivy ash reed sage "
    # Words that look like names at start of sentence
    "monday tuesday wednesday thursday friday saturday sunday "
    "january february march july august september october november "
    "december".split()
)
87
+
88
+ # Context patterns that indicate a word is being used as a PERSON name
89
# Each entry is a regex template: "{name}" is filled in with str.format()
# using the regex-escaped, lowercased candidate word, and the result is
# searched against the lowercased context sentence.
PERSON_CONTEXT_PATTERNS = [
    # name followed by a verb people typically perform
    r"\b{name}\s+said\b",
    r"\b{name}\s+told\b",
    r"\b{name}\s+asked\b",
    r"\b{name}\s+laughed\b",
    r"\b{name}\s+smiled\b",
    r"\b{name}\s+was\b",
    r"\b{name}\s+is\b",
    r"\b{name}\s+called\b",
    r"\b{name}\s+texted\b",
    # name as the object of an action or preposition
    r"\bwith\s+{name}\b",
    r"\bsaw\s+{name}\b",
    r"\bcalled\s+{name}\b",
    r"\btook\s+{name}\b",
    r"\bpicked\s+up\s+{name}\b",
    r"\bdrop(?:ped)?\s+(?:off\s+)?{name}\b",
    # Possessives: Riley's, Max's, the Rileys'.
    # A trailing \b after an apostrophe only holds when a word character
    # follows, so the plural form ends with a negative lookahead instead.
    r"\b{name}(?:'s\b|s'(?!\w))",
    # direct address
    r"\bhey\s+{name}\b",
    r"\bthanks?\s+{name}\b",
    r"^{name}[:\s]",  # dialogue: "Riley: ..."
    r"\bmy\s+(?:son|daughter|kid|child|brother|sister|friend|partner|colleague|coworker)\s+{name}\b",
]
111
+
112
+ # Context patterns that indicate a word is NOT being used as a name
113
# Each entry is a regex template: "{name}" is filled in with str.format()
# using the regex-escaped, lowercased candidate word, and the result is
# searched against the lowercased context sentence.
CONCEPT_CONTEXT_PATTERNS = [
    r"\bhave\s+you\s+{name}\b",  # "have you ever"
    r"\bif\s+you\s+{name}\b",  # "if you ever"
    r"\b{name}\s+since\b",  # "ever since"
    r"\b{name}\s+again\b",  # "ever again"
    r"\bnot\s+{name}\b",  # "not ever"
    r"\b{name}\s+more\b",  # "ever more"
    r"\bwould\s+{name}\b",  # "would ever"
    r"\bcould\s+{name}\b",  # "could ever"
    r"\bwill\s+{name}\b",  # "will ever"
    # "the grace of", "the mark of".
    # The leading \b stops the name from matching as the tail of a longer
    # word (e.g. "art" inside "start of", "ever" inside "forever in").
    r"\b(?:the\s+)?{name}\s+(?:of|in|at|for|to)\b",
]
125
+
126
+
127
+ # ─────────────────────────────────────────────────────────────────────────────
128
+ # Wikipedia lookup for unknown words
129
+ # ─────────────────────────────────────────────────────────────────────────────
130
+
131
+ # Phrases in Wikipedia summaries that indicate a personal name
132
# Lowercase substrings; any one of them appearing in a (lowercased) summary
# extract marks the looked-up word as a personal name.
NAME_INDICATOR_PHRASES = [
    # generic "this is a name" wording
    *("given name", "personal name", "first name", "forename"),
    # gendered wording
    *("masculine name", "feminine name", "boy's name", "girl's name"),
    *("male name", "female name"),
    # origin / language wording
    *("irish name", "welsh name", "scottish name", "gaelic name"),
    *("hebrew name", "arabic name", "norse name", "old english name"),
    # loose descriptive wording
    *("is a name", "as a name", "name meaning", "name derived from"),
    # legendary figures
    *("legendary irish", "legendary welsh", "legendary scottish"),
]
159
+
160
# Lowercase substrings; any one of them appearing in a (lowercased) summary
# extract marks the looked-up word as a place.
PLACE_INDICATOR_PHRASES = [
    # settlements
    *("city in", "town in", "village in", "municipality"),
    # administrative areas
    *("capital of", "district of", "county", "province", "region of"),
    # geographic features
    *("island of", "mountain in", "river in"),
]
174
+
175
+
176
def _wikipedia_lookup(word: str) -> dict:
    """
    Look up a word via the Wikipedia REST "page summary" endpoint.

    word: the term to look up (used verbatim as the page title).

    Returns a fresh dict on every call with at least:
        "inferred_type": "person" | "place" | "concept" | "ambiguous" | "unknown"
        "confidence":    float
        "wiki_summary":  first 200 chars of the lowercased extract, or None
        "wiki_title":    page title reported by Wikipedia, or None
    and optionally "note" with a human-readable explanation.

    Never raises for network or decoding problems: those yield
    inferred_type "unknown" with confidence 0.0.
    Free, no API key, handles disambiguation pages.
    """
    # Imported locally so the handlers below do not rely on urllib.request
    # having pulled urllib.error in as a side effect.
    import urllib.error

    try:
        # safe="" percent-encodes "/" too, so a slash inside the word cannot
        # change which URL path is requested.
        title_part = urllib.parse.quote(word, safe="")
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title_part}"
        req = urllib.request.Request(url, headers={"User-Agent": "MemPalace/1.0"})
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # Must precede the OSError handler: HTTPError is a subclass of it.
        if e.code == 404:
            # Not in Wikipedia — strong signal it's a proper noun (unusual name, nickname)
            return {
                "inferred_type": "person",
                "confidence": 0.70,
                "wiki_summary": None,
                "wiki_title": None,
                "note": "not found in Wikipedia — likely a proper noun or unusual name",
            }
        return {
            "inferred_type": "unknown",
            "confidence": 0.0,
            "wiki_summary": None,
            "wiki_title": None,
        }
    except (urllib.error.URLError, OSError, ValueError, KeyError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {
            "inferred_type": "unknown",
            "confidence": 0.0,
            "wiki_summary": None,
            "wiki_title": None,
        }

    if not isinstance(data, dict):
        # Valid JSON, but not the object shape the rest of this function reads.
        return {
            "inferred_type": "unknown",
            "confidence": 0.0,
            "wiki_summary": None,
            "wiki_title": None,
        }

    page_type = data.get("type", "")
    # "or ''" guards against an explicit JSON null for these fields.
    extract = str(data.get("extract") or "").lower()
    summary = extract[:200]
    title = data.get("title", word)

    # Disambiguation — look at description
    if page_type == "disambiguation":
        desc = str(data.get("description") or "").lower()
        # "given name" contains "name", so a single substring test suffices.
        if "name" in desc:
            return {
                "inferred_type": "person",
                "confidence": 0.65,
                "wiki_summary": summary,
                "wiki_title": title,
                "note": "disambiguation page with name entries",
            }
        return {
            "inferred_type": "ambiguous",
            "confidence": 0.4,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Check for name indicators
    if any(phrase in extract for phrase in NAME_INDICATOR_PHRASES):
        # Higher confidence if the word itself is described as a name
        word_lower = word.lower()
        described_directly = (
            f"{word_lower} is a" in extract or f"{word_lower} (name" in extract
        )
        return {
            "inferred_type": "person",
            "confidence": 0.90 if described_directly else 0.80,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Check for place indicators
    if any(phrase in extract for phrase in PLACE_INDICATOR_PHRASES):
        return {
            "inferred_type": "place",
            "confidence": 0.80,
            "wiki_summary": summary,
            "wiki_title": title,
        }

    # Found but doesn't match name/place patterns
    return {
        "inferred_type": "concept",
        "confidence": 0.60,
        "wiki_summary": summary,
        "wiki_title": title,
    }
258
+
259
+
260
+ # ─────────────────────────────────────────────────────────────────────────────
261
+ # Entity Registry
262
+ # ─────────────────────────────────────────────────────────────────────────────
263
+
264
+
265
class EntityRegistry:
    """
    Persistent personal entity registry.

    Stored at ~/.mempalace/entity_registry.json
    Schema:
    {
      "mode": "personal",   # work | personal | combo
      "version": 1,
      "people": {
        "Riley": {
          "source": "onboarding",
          "contexts": ["personal"],
          "aliases": [],
          "relationship": "daughter",
          "confidence": 1.0
        }
      },
      "projects": ["MemPalace", "Acme"],
      "ambiguous_flags": ["grace", "max"],
      "wiki_cache": {
        "Sam": {"inferred_type": "person", "confidence": 0.9, "confirmed": true, ...}
      }
    }
    """

    DEFAULT_PATH = Path.home() / ".mempalace" / "entity_registry.json"

    def __init__(self, data: dict, path: Path):
        # data: the registry document (see class docstring for the schema)
        # path: JSON file that save() writes to
        self._data = data
        self._path = path

    # ── Load / Save ──────────────────────────────────────────────────────────

    @classmethod
    def load(cls, config_dir: Optional[Path] = None) -> "EntityRegistry":
        """
        Load the registry from <config_dir>/entity_registry.json
        (DEFAULT_PATH when config_dir is not given).

        A missing, unreadable, undecodable, or non-object file yields an empty
        registry. Keys absent from the stored document are filled in from
        _empty() so every other method can rely on them being present.
        """
        path = (Path(config_dir) / "entity_registry.json") if config_dir else cls.DEFAULT_PATH
        data = cls._empty()
        if path.exists():
            try:
                stored = json.loads(path.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                stored = None
            if isinstance(stored, dict):
                data.update(stored)
        return cls(data, path)

    def save(self):
        """Write the registry as indented JSON, creating parent directories as needed."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")

    @staticmethod
    def _empty() -> dict:
        """Return a fresh, empty registry document."""
        return {
            "version": 1,
            "mode": "personal",
            "people": {},
            "projects": [],
            "ambiguous_flags": [],
            "wiki_cache": {},
        }

    # ── Properties ───────────────────────────────────────────────────────────

    @property
    def mode(self) -> str:
        return self._data.get("mode", "personal")

    @property
    def people(self) -> dict:
        return self._data.get("people", {})

    @property
    def projects(self) -> list:
        return self._data.get("projects", [])

    @property
    def ambiguous_flags(self) -> list:
        return self._data.get("ambiguous_flags", [])

    # ── Seed from onboarding ─────────────────────────────────────────────────

    def seed(self, mode: str, people: list, projects: list, aliases: Optional[dict] = None):
        """
        Seed the registry from onboarding data and save it.

        people: list of dicts {"name": str, "relationship": str, "context": str}
        projects: list of str
        aliases: dict {"Max": "Maxwell", ...}  (alias → canonical name)

        Entries whose name is blank after stripping are skipped. Every alias
        of a seeded person is stored on that person's entry and also
        registered as its own entry carrying a "canonical" back-reference.
        """
        self._data["mode"] = mode
        self._data["projects"] = list(projects)
        registry = self._data.setdefault("people", {})

        # canonical name → all of its aliases. A plain inverted dict would keep
        # only one alias when several point at the same person.
        aliases_by_canonical: dict = {}
        for alias, canonical in (aliases or {}).items():
            aliases_by_canonical.setdefault(canonical, []).append(alias)

        for entry in people:
            name = entry["name"].strip()
            if not name:
                continue
            context = entry.get("context", "personal")
            relationship = entry.get("relationship", "")
            known_aliases = aliases_by_canonical.get(name, [])

            registry[name] = {
                "source": "onboarding",
                "contexts": [context],
                "aliases": list(known_aliases),
                "relationship": relationship,
                "confidence": 1.0,
            }

            # Also register aliases
            for alias in known_aliases:
                registry[alias] = {
                    "source": "onboarding",
                    "contexts": [context],
                    "aliases": [name],
                    "relationship": relationship,
                    "confidence": 1.0,
                    "canonical": name,
                }

        # Flag ambiguous names (also common English words)
        self._data["ambiguous_flags"] = [
            name.lower() for name in registry if name.lower() in COMMON_ENGLISH_WORDS
        ]

        self.save()

    # ── Lookup ───────────────────────────────────────────────────────────────

    def lookup(self, word: str, context: str = "") -> dict:
        """
        Look up a word. Returns entity classification.

        context: surrounding sentence (used for disambiguation of ambiguous words)

        Checks, in order: the people registry (names and aliases,
        case-insensitive), the project list, then confirmed wiki-cache entries.

        Returns:
            {"type": "person"|"project"|"concept"|"unknown",
             "confidence": float,
             "source": "onboarding"|"learned"|"wiki"|"inferred",
             "name": canonical name if found,
             "needs_disambiguation": bool}
        """
        needle = word.lower()

        # 1. Exact match in people registry
        for canonical, info in self.people.items():
            alias_keys = [a.lower() for a in info.get("aliases", [])]
            if needle != canonical.lower() and needle not in alias_keys:
                continue
            # Check if this is an ambiguous word
            if needle in self.ambiguous_flags and context:
                resolved = self._disambiguate(word, context, info)
                if resolved is not None:
                    return resolved
            return {
                "type": "person",
                "confidence": info["confidence"],
                "source": info["source"],
                "name": canonical,
                "context": info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
            }

        # 2. Project match
        for proj in self.projects:
            if needle == proj.lower():
                return {
                    "type": "project",
                    "confidence": 1.0,
                    "source": "onboarding",
                    "name": proj,
                    "needs_disambiguation": False,
                }

        # 3. Wiki cache (only entries a user or caller has confirmed)
        for cached_word, cached_result in self._data.get("wiki_cache", {}).items():
            if needle == cached_word.lower() and cached_result.get("confirmed"):
                return {
                    "type": cached_result["inferred_type"],
                    "confidence": cached_result["confidence"],
                    "source": "wiki",
                    "name": word,
                    "needs_disambiguation": False,
                }

        return {
            "type": "unknown",
            "confidence": 0.0,
            "source": "none",
            "name": word,
            "needs_disambiguation": False,
        }

    def _disambiguate(self, word: str, context: str, person_info: dict) -> Optional[dict]:
        """
        When a word is both a name and a common word, check context.

        Counts how many PERSON_CONTEXT_PATTERNS and CONCEPT_CONTEXT_PATTERNS
        match the lowercased context. Returns a person result when person
        patterns win, a concept result when concept patterns win, and None on
        a tie (including no matches at all).
        """
        escaped = re.escape(word.lower())
        ctx_lower = context.lower()

        person_score = sum(
            1 for pat in PERSON_CONTEXT_PATTERNS if re.search(pat.format(name=escaped), ctx_lower)
        )
        concept_score = sum(
            1 for pat in CONCEPT_CONTEXT_PATTERNS if re.search(pat.format(name=escaped), ctx_lower)
        )

        if person_score > concept_score:
            return {
                "type": "person",
                "confidence": min(0.95, 0.7 + person_score * 0.1),
                "source": person_info["source"],
                "name": word,
                "context": person_info.get("contexts", ["personal"]),
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }
        if concept_score > person_score:
            return {
                "type": "concept",
                "confidence": min(0.90, 0.7 + concept_score * 0.1),
                "source": "context_disambiguated",
                "name": word,
                "needs_disambiguation": False,
                "disambiguated_by": "context_patterns",
            }

        # Truly ambiguous — return None to fall through to person (registered name)
        return None

    # ── Research unknown words ───────────────────────────────────────────────

    def research(self, word: str, auto_confirm: bool = False) -> dict:
        """
        Research an unknown word via Wikipedia.

        Returns the cached entry if one exists; otherwise performs the lookup,
        tags the result with "word" and "confirmed" (= auto_confirm; False
        means it still needs user review), caches it and saves.

        Results of type "unknown" (network or decoding failure) are returned
        but not cached, so a transient failure does not block later retries.
        """
        # Already cached?
        cache = self._data.setdefault("wiki_cache", {})
        if word in cache:
            return cache[word]

        result = _wikipedia_lookup(word)
        result["word"] = word
        result["confirmed"] = auto_confirm

        if result.get("inferred_type") != "unknown":
            cache[word] = result
            self.save()
        return result

    def confirm_research(
        self, word: str, entity_type: str, relationship: str = "", context: str = "personal"
    ):
        """
        Mark a researched word as confirmed and save.

        When entity_type is "person" the word is also added to the people
        registry (source "wiki", confidence 0.90) and flagged as ambiguous if
        it is a common English word.
        """
        cache = self._data.get("wiki_cache", {})
        if word in cache:
            cache[word]["confirmed"] = True
            cache[word]["confirmed_type"] = entity_type

        if entity_type == "person":
            self._data.setdefault("people", {})[word] = {
                "source": "wiki",
                "contexts": [context],
                "aliases": [],
                "relationship": relationship,
                "confidence": 0.90,
            }
            if word.lower() in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if word.lower() not in flags:
                    flags.append(word.lower())

        self.save()

    # ── Learn from sessions ──────────────────────────────────────────────────

    def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list:
        """
        Scan session text for new entity candidates.

        Candidates classified as "person" with confidence >= min_confidence
        are added to the people registry (source "learned"); the registry is
        saved only if something was added.
        Returns list of newly discovered candidates for review.
        """
        from mempalace.entity_detector import extract_candidates, score_entity, classify_entity

        lines = text.splitlines()
        candidates = extract_candidates(text)
        new_candidates = []

        for name, frequency in candidates.items():
            # Skip if already known
            if name in self.people or name in self.projects:
                continue

            scores = score_entity(name, text, lines)
            entity = classify_entity(name, frequency, scores)

            if entity["type"] != "person" or entity["confidence"] < min_confidence:
                continue

            self._data.setdefault("people", {})[name] = {
                "source": "learned",
                # "combo" is not a context of its own; such entries default to personal
                "contexts": [self.mode if self.mode != "combo" else "personal"],
                "aliases": [],
                "relationship": "",
                "confidence": entity["confidence"],
                "seen_count": frequency,
            }
            if name.lower() in COMMON_ENGLISH_WORDS:
                flags = self._data.setdefault("ambiguous_flags", [])
                if name.lower() not in flags:
                    flags.append(name.lower())
            new_candidates.append(entity)

        if new_candidates:
            self.save()

        return new_candidates

    # ── Query helpers for retrieval ──────────────────────────────────────────

    def extract_people_from_query(self, query: str) -> list:
        """
        Extract known person names from a query string.

        Names and aliases are matched case-insensitively on word boundaries.
        A name flagged as ambiguous only counts when the query context
        disambiguates it as a person.
        Returns list of registry names found, without duplicates.
        """
        found = []

        for canonical, info in self.people.items():
            for name in [canonical] + info.get("aliases", []):
                # Word boundary match
                if not re.search(rf"\b{re.escape(name)}\b", query, re.IGNORECASE):
                    continue
                # For ambiguous words, check context
                if name.lower() in self.ambiguous_flags:
                    result = self._disambiguate(name, query, info)
                    if not (result and result["type"] == "person"):
                        continue
                if canonical not in found:
                    found.append(canonical)
        return found

    def extract_unknown_candidates(self, query: str) -> list:
        """
        Find capitalized words in query that aren't in registry or common words.
        These are candidates for Wikipedia research.

        Each word is reported once, in order of first appearance.
        """
        candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
        unknown = []
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        for word in dict.fromkeys(candidates):
            if word.lower() in COMMON_ENGLISH_WORDS:
                continue
            if self.lookup(word)["type"] == "unknown":
                unknown.append(word)
        return unknown

    # ── Summary ──────────────────────────────────────────────────────────────

    def summary(self) -> str:
        """Return a short multi-line, human-readable overview of the registry."""
        people = self.people
        shown = ", ".join(list(people.keys())[:8])
        ellipsis = "..." if len(people) > 8 else ""
        lines = [
            f"Mode: {self.mode}",
            f"People: {len(people)} ({shown}{ellipsis})",
            f"Projects: {', '.join(self.projects) or '(none)'}",
            f"Ambiguous flags: {', '.join(self.ambiguous_flags) or '(none)'}",
            f"Wiki cache: {len(self._data.get('wiki_cache', {}))} entries",
        ]
        return "\n".join(lines)