mempalace-code 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mempalace/README.md +40 -0
- mempalace/__init__.py +6 -0
- mempalace/__main__.py +5 -0
- mempalace/cli.py +811 -0
- mempalace/config.py +149 -0
- mempalace/convo_miner.py +415 -0
- mempalace/dialect.py +1075 -0
- mempalace/entity_detector.py +853 -0
- mempalace/entity_registry.py +639 -0
- mempalace/export.py +378 -0
- mempalace/general_extractor.py +521 -0
- mempalace/knowledge_graph.py +410 -0
- mempalace/layers.py +515 -0
- mempalace/mcp_server.py +873 -0
- mempalace/migrate.py +153 -0
- mempalace/miner.py +1285 -0
- mempalace/normalize.py +328 -0
- mempalace/onboarding.py +489 -0
- mempalace/palace_graph.py +225 -0
- mempalace/py.typed +0 -0
- mempalace/room_detector_local.py +310 -0
- mempalace/searcher.py +305 -0
- mempalace/spellcheck.py +269 -0
- mempalace/split_mega_files.py +309 -0
- mempalace/storage.py +807 -0
- mempalace/version.py +3 -0
- mempalace_code-1.0.0.dist-info/METADATA +489 -0
- mempalace_code-1.0.0.dist-info/RECORD +32 -0
- mempalace_code-1.0.0.dist-info/WHEEL +4 -0
- mempalace_code-1.0.0.dist-info/entry_points.txt +2 -0
- mempalace_code-1.0.0.dist-info/licenses/LICENSE +192 -0
- mempalace_code-1.0.0.dist-info/licenses/NOTICE +17 -0
|
@@ -0,0 +1,853 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
entity_detector.py — Auto-detect people and projects from file content.
|
|
4
|
+
|
|
5
|
+
Two-pass approach:
|
|
6
|
+
Pass 1: scan files, extract entity candidates with signal counts
|
|
7
|
+
Pass 2: score and classify each candidate as person, project, or uncertain
|
|
8
|
+
|
|
9
|
+
Used by mempalace init before mining begins.
|
|
10
|
+
The confirmed entity map feeds the miner as the taxonomy.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from entity_detector import detect_entities, confirm_entities
|
|
14
|
+
candidates = detect_entities(file_paths)
|
|
15
|
+
confirmed = confirm_entities(candidates) # interactive review
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
import os
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ==================== SIGNAL PATTERNS ====================
|
|
25
|
+
|
|
26
|
+
# Person signals — things people do
_PERSON_VERBS = [
    "said", "asked", "told", "replied", "laughed", "smiled", "cried",
    "felt", "thinks?", "wants?", "loves?", "hates?", "knows?",
    "decided", "pushed", "wrote",
]
_DIRECT_ADDRESS = ["hey", "thanks?", "hi", "dear"]
# "<Name> <verb>" templates followed by "<greeting> <Name>" templates;
# {name} is substituted (pre-escaped) by _build_patterns.
PERSON_VERB_PATTERNS = [r"\b{name}\s+" + v + r"\b" for v in _PERSON_VERBS] + [
    r"\b" + g + r"\s+{name}\b" for g in _DIRECT_ADDRESS
]

# Person signals — pronouns resolving nearby
PRONOUN_PATTERNS = [
    r"\b" + p + r"\b"
    for p in ("she", "her", "hers", "he", "him", "his", "they", "them", "their")
]

# Person signals — dialogue markers
DIALOGUE_PATTERNS = [
    r"^>\s*{name}[:\s]",  # > Speaker: ...
    r"^{name}:\s",  # Speaker: ...
    r"^\[{name}\]",  # [Speaker]
    r'"{name}\s+said',
]
|
|
70
|
+
|
|
71
|
+
# Project signals — things projects have/do
_PROJECT_ACTIONS = [
    r"\bbuilding\s+{name}\b",
    r"\bbuilt\s+{name}\b",
    r"\bship(?:ping|ped)?\s+{name}\b",
    r"\blaunch(?:ing|ed)?\s+{name}\b",
    r"\bdeploy(?:ing|ed)?\s+{name}\b",
    r"\binstall(?:ing|ed)?\s+{name}\b",
]
# "the <Name> <noun>" constructions that mark software artifacts.
_PROJECT_NOUNS = ["architecture", "pipeline", "system", "repo"]
PROJECT_VERB_PATTERNS = (
    _PROJECT_ACTIONS
    + [r"\bthe\s+{name}\s+" + noun + r"\b" for noun in _PROJECT_NOUNS]
    + [
        r"\b{name}\s+v\d+\b",  # MemPal v2
        r"\b{name}\.py\b",  # mempalace.py
        r"\b{name}-core\b",  # mempal-core (hyphen only, not underscore)
        r"\b{name}-local\b",
        r"\bimport\s+{name}\b",
        r"\bpip\s+install\s+{name}\b",
    ]
)
|
|
90
|
+
|
|
91
|
+
# Words that are almost certainly NOT entities.
# NOTE: the previous literal contained duplicate entries ("true", "false",
# "none", "get", "find", "copy", "type", "then", "system", "ok", "from") —
# a set literal silently absorbs duplicates, so removing them here is
# behavior-neutral; it just keeps the list honest and easier to scan.
STOPWORDS = {
    # Function words, auxiliaries, pronouns, and question words
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
    "been", "being", "have", "has", "had", "do", "does", "did", "will",
    "would", "could", "should", "may", "might", "must", "shall", "can",
    "this", "that", "these", "those", "it", "its", "they", "them", "their",
    "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
    "who", "what", "when", "where", "why", "how", "which", "if", "then",
    "so", "not", "no", "yes", "ok", "okay", "just", "very", "really",
    "also", "already", "still", "even", "only", "here", "there", "now",
    "too", "up", "out", "about", "like", "use", "get", "got", "make",
    "made", "take", "put", "come", "go", "see", "know", "think", "true",
    "false", "none", "null", "new", "old", "all", "any", "some",
    # Python keywords/builtins that show up in code snippets
    "return", "print", "def", "class", "import",
    # Common capitalized words in prose that aren't entities
    "step", "usage", "run", "check", "find", "add", "set", "list", "args",
    "dict", "str", "int", "bool", "path", "file", "type", "name", "note",
    "example", "option", "result", "error", "warning", "info", "every",
    "each", "more", "less", "next", "last", "first", "second", "stack",
    "layer", "mode", "test", "stop", "start", "copy", "move", "source",
    "target", "output", "input", "data", "item", "key", "value", "returns",
    "raises", "yields", "self", "cls", "kwargs",
    # Common sentence-starting / abstract words that aren't entities
    "world", "well", "want", "topic", "choose", "social", "cars", "phones",
    "healthcare", "ex", "machina", "deus", "human", "humans", "people",
    "things", "something", "nothing", "everything", "anything", "someone",
    "everyone", "anyone", "way", "time", "day", "life", "place", "thing",
    "part", "kind", "sort", "case", "point", "idea", "fact", "sense",
    "question", "answer", "reason", "number", "version", "system",
    # Greetings and filler words at sentence starts
    "hey", "hi", "hello", "thanks", "thank", "right", "let",
    # UI/action words that appear in how-to content
    "click", "hit", "press", "tap", "drag", "drop", "open", "close",
    "save", "load", "launch", "install", "download", "upload", "scroll",
    "select", "enter", "submit", "cancel", "confirm", "delete", "paste",
    "write", "read", "search", "show", "hide",
    # Common filesystem/technical capitalized words
    "desktop", "documents", "downloads", "users", "home", "library",
    "applications", "preferences", "settings", "terminal",
    # Abstract/topic words
    "actor", "vector", "remote", "control", "duration", "fetch",
    # Abstract concepts that appear as subjects but aren't entities
    "agents", "tools", "others", "guards", "ethics", "regulation",
    "learning", "thinking", "memory", "language", "intelligence",
    "technology", "society", "culture", "future", "history", "science",
    "model", "models", "network", "networks", "training", "inference",
}
|
|
397
|
+
|
|
398
|
+
# For entity detection — prose only, no code files.
# Code files have too many capitalized names (classes, functions) that aren't entities.
PROSE_EXTENSIONS = {".txt", ".md", ".rst", ".csv"}

# Everything we are willing to read at all: prose plus common code/config formats.
READABLE_EXTENSIONS = PROSE_EXTENSIONS | {
    ".py", ".js", ".ts", ".json", ".yaml", ".yml",
    ".toml", ".sh", ".rb", ".go", ".rs",
}

# Directories never worth walking into (VCS, dependency caches, build output).
SKIP_DIRS = {
    ".git", "node_modules", "__pycache__", ".venv", "venv", "env",
    "dist", "build", ".next", "coverage", ".mempalace",
}
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
# ==================== CANDIDATE EXTRACTION ====================
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def extract_candidates(text: str) -> dict:
    """Pull capitalized proper-noun candidates out of *text*.

    Single capitalized words and multi-word Title Case phrases are tallied;
    anything whose lowercase form is a stopword is dropped.
    Returns {name: frequency} restricted to names seen at least 3 times.
    """
    tally = defaultdict(int)

    # Single capitalized words, 2-20 chars. Sentence-start false positives
    # are tolerated here — the frequency threshold below filters them.
    for token in re.findall(r"\b([A-Z][a-z]{1,19})\b", text):
        if len(token) > 1 and token.lower() not in STOPWORDS:
            tally[token] += 1

    # Multi-word proper nouns, e.g. "Memory Palace", "Claude Code".
    for phrase in re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text):
        if all(w.lower() not in STOPWORDS for w in phrase.split()):
            tally[phrase] += 1

    # A candidate must recur: keep only names with 3+ occurrences.
    return {name: n for name, n in tally.items() if n >= 3}
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# ==================== SIGNAL SCORING ====================
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _build_patterns(name: str) -> dict:
    """Pre-compile all regex patterns for a single entity name."""
    escaped = re.escape(name)

    def _compile_all(templates, flags=re.IGNORECASE):
        # Substitute the escaped name into each template, then compile.
        return [re.compile(t.format(name=escaped), flags) for t in templates]

    return {
        # Dialogue markers are line-anchored, so they need MULTILINE.
        "dialogue": _compile_all(DIALOGUE_PATTERNS, re.MULTILINE | re.IGNORECASE),
        "person_verbs": _compile_all(PERSON_VERB_PATTERNS),
        "project_verbs": _compile_all(PROJECT_VERB_PATTERNS),
        "direct": re.compile(
            rf"\bhey\s+{escaped}\b|\bthanks?\s+{escaped}\b|\bhi\s+{escaped}\b",
            re.IGNORECASE,
        ),
        "versioned": re.compile(rf"\b{escaped}[-v]\w+", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{escaped}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def score_entity(name: str, text: str, lines: list) -> dict:
    """Score one candidate as person vs project.

    Tallies pattern hits in *text* plus pronoun proximity in *lines* and
    returns both scores along with (up to three) fired signals per side.
    """
    compiled = _build_patterns(name)
    person_score = 0
    project_score = 0
    person_signals = []
    project_signals = []

    # --- Person signals ---

    # Dialogue markers are the strongest person evidence (3 pts per hit).
    for rx in compiled["dialogue"]:
        n_hits = len(rx.findall(text))
        if n_hits:
            person_score += 3 * n_hits
            person_signals.append(f"dialogue marker ({n_hits}x)")

    # "<Name> said/asked/..." action verbs (2 pts per hit).
    for rx in compiled["person_verbs"]:
        n_hits = len(rx.findall(text))
        if n_hits:
            person_score += 2 * n_hits
            person_signals.append(f"'{name} ...' action ({n_hits}x)")

    # Pronoun proximity: count lines mentioning the name that have any
    # pronoun within a +/-2-line window (2 pts per such line).
    lowered = name.lower()
    nearby = 0
    for idx, line in enumerate(lines):
        if lowered not in line.lower():
            continue
        window = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
        if any(re.search(p, window) for p in PRONOUN_PATTERNS):
            nearby += 1
    if nearby:
        person_score += 2 * nearby
        person_signals.append(f"pronoun nearby ({nearby}x)")

    # Direct address ("hey X", "thanks X", "hi X") — 4 pts per hit.
    addressed = len(compiled["direct"].findall(text))
    if addressed:
        person_score += 4 * addressed
        person_signals.append(f"addressed directly ({addressed}x)")

    # --- Project signals ---

    for rx in compiled["project_verbs"]:
        n_hits = len(rx.findall(text))
        if n_hits:
            project_score += 2 * n_hits
            project_signals.append(f"project verb ({n_hits}x)")

    hyphenated = len(compiled["versioned"].findall(text))
    if hyphenated:
        project_score += 3 * hyphenated
        project_signals.append(f"versioned/hyphenated ({hyphenated}x)")

    file_refs = len(compiled["code_ref"].findall(text))
    if file_refs:
        project_score += 3 * file_refs
        project_signals.append(f"code file reference ({file_refs}x)")

    return {
        "person_score": person_score,
        "project_score": project_score,
        "person_signals": person_signals[:3],
        "project_signals": project_signals[:3],
    }
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
# ==================== CLASSIFY ====================
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def classify_entity(name: str, frequency: int, scores: dict) -> dict:
    """Classify a scored candidate as person / project / uncertain.

    Args:
        name: Candidate entity name.
        frequency: How many times the name appeared in the scanned text.
        scores: Output of score_entity() — person/project scores + signals.

    Returns:
        Entity dict: {name, type, confidence, frequency, signals}.
    """
    ps = scores["person_score"]
    prs = scores["project_score"]
    total = ps + prs

    if total == 0:
        # No strong signals — frequency-only candidate, uncertain.
        # Confidence scales with frequency but is capped at 0.4.
        return {
            "name": name,
            "type": "uncertain",
            "confidence": round(min(0.4, frequency / 50), 2),
            "frequency": frequency,
            "signals": [f"appears {frequency}x, no strong type signals"],
        }

    # total > 0 is guaranteed here (early return above), so no guard needed.
    person_ratio = ps / total

    # Require TWO different signal categories to confidently classify as a person.
    # One signal type with many hits (e.g. "Click, click, click...") is not enough —
    # it just means that word appears often in a particular syntactic position.
    signal_categories = set()
    for s in scores["person_signals"]:
        if "dialogue" in s:
            signal_categories.add("dialogue")
        elif "action" in s:
            signal_categories.add("action")
        elif "pronoun" in s:
            signal_categories.add("pronoun")
        elif "addressed" in s:
            signal_categories.add("addressed")

    has_two_signal_types = len(signal_categories) >= 2

    if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
        entity_type = "person"
        confidence = min(0.99, 0.5 + person_ratio * 0.5)
        signals = scores["person_signals"] or [f"appears {frequency}x"]
    elif person_ratio >= 0.7:
        # Person-leaning but weak evidence (one signal category or low score)
        # — downgrade to uncertain.
        entity_type = "uncertain"
        confidence = 0.4
        signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
    elif person_ratio <= 0.3:
        entity_type = "project"
        confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
        signals = scores["project_signals"] or [f"appears {frequency}x"]
    else:
        entity_type = "uncertain"
        confidence = 0.5
        signals = (scores["person_signals"] + scores["project_signals"])[:3]
        signals.append("mixed signals — needs review")

    return {
        "name": name,
        "type": entity_type,
        "confidence": round(confidence, 2),
        "frequency": frequency,
        "signals": signals,
    }
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
# ==================== MAIN DETECT ====================
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def detect_entities(file_paths: list, max_files: int = 10) -> dict:
    """Scan files and detect entity candidates.

    Args:
        file_paths: List of Path objects to scan.
        max_files: Max files to read (for speed).

    Returns:
        {"people": [...], "projects": [...], "uncertain": [...]} — entity
        dicts sorted by confidence (uncertain by frequency), capped to the
        most relevant entries.
    """
    head_bytes = 5_000  # first 5KB per file — enough to catch recurring entities

    chunks = []
    line_pool = []
    read_count = 0
    for path in file_paths:
        if read_count >= max_files:
            break
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                chunk = fh.read(head_bytes)
        except OSError:
            continue  # unreadable file — skip without counting it
        chunks.append(chunk)
        line_pool.extend(chunk.splitlines())
        read_count += 1

    corpus = "\n".join(chunks)

    candidates = extract_candidates(corpus)
    if not candidates:
        return {"people": [], "projects": [], "uncertain": []}

    # Score + classify every candidate, most frequent first.
    buckets = {"person": [], "project": [], "uncertain": []}
    for cand_name, freq in sorted(candidates.items(), key=lambda kv: kv[1], reverse=True):
        entity = classify_entity(cand_name, freq, score_entity(cand_name, corpus, line_pool))
        buckets[entity["type"]].append(entity)

    # Order by confidence (uncertain entries by raw frequency instead).
    people = sorted(buckets["person"], key=lambda e: e["confidence"], reverse=True)
    projects = sorted(buckets["project"], key=lambda e: e["confidence"], reverse=True)
    uncertain = sorted(buckets["uncertain"], key=lambda e: e["frequency"], reverse=True)

    # Cap results to most relevant
    return {
        "people": people[:15],
        "projects": projects[:10],
        "uncertain": uncertain[:8],
    }
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
# ==================== INTERACTIVE CONFIRM ====================
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def _print_entity_list(entities: list, label: str):
|
|
707
|
+
print(f"\n {label}:")
|
|
708
|
+
if not entities:
|
|
709
|
+
print(" (none detected)")
|
|
710
|
+
return
|
|
711
|
+
for i, e in enumerate(entities):
|
|
712
|
+
confidence_bar = "●" * int(e["confidence"] * 5) + "○" * (5 - int(e["confidence"] * 5))
|
|
713
|
+
signals_str = ", ".join(e["signals"][:2]) if e["signals"] else ""
|
|
714
|
+
print(f" {i + 1:2}. {e['name']:20} [{confidence_bar}] {signals_str}")
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
def confirm_entities(detected: dict, yes: bool = False) -> dict:
    """
    Interactive confirmation step.
    User reviews detected entities, removes wrong ones, adds missing ones.

    Args:
        detected: Output of detect_entities() — people/projects/uncertain lists.
        yes: Auto-accept all detected entities without prompting.

    Returns:
        Confirmed {"people": [names], "projects": [names]}.
    """
    print(f"\n{'=' * 58}")
    print(" MemPalace — Entity Detection")
    print(f"{'=' * 58}")
    print("\n Scanned your files. Here's what we found:\n")

    _print_entity_list(detected["people"], "PEOPLE")
    _print_entity_list(detected["projects"], "PROJECTS")

    if detected["uncertain"]:
        _print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")

    # Start from everything detected; the edit flow below can prune/extend.
    # (Previously these lists were recomputed identically after the input()
    # call — the duplicate assignments were redundant and are removed.)
    confirmed_people = [e["name"] for e in detected["people"]]
    confirmed_projects = [e["name"] for e in detected["projects"]]

    if yes:
        # Auto-accept: include all detected (skip uncertain — ambiguous without user input)
        print(
            f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
        )
        return {"people": confirmed_people, "projects": confirmed_projects}

    print(f"\n{'─' * 58}")
    print(" Options:")
    print(" [enter] Accept all")
    print(" [edit] Remove wrong entries or reclassify uncertain")
    print(" [add] Add missing people or projects")
    print()

    choice = input(" Your choice [enter/edit/add]: ").strip().lower()

    if choice == "edit":
        # Handle uncertain first
        if detected["uncertain"]:
            print("\n Uncertain entities — classify each:")
            for e in detected["uncertain"]:
                # Fixed prompt typo: "(r)roject" -> "p(r)oject" (matches the
                # add-flow prompt below; the accepted answer is still "r").
                ans = input(f" {e['name']} — (p)erson, p(r)oject, or (s)kip? ").strip().lower()
                if ans == "p":
                    confirmed_people.append(e["name"])
                elif ans == "r":
                    confirmed_projects.append(e["name"])

        # Remove wrong people
        print(f"\n Current people: {', '.join(confirmed_people) or '(none)'}")
        remove = input(
            " Numbers to REMOVE from people (comma-separated, or enter to skip): "
        ).strip()
        if remove:
            to_remove = {int(x.strip()) - 1 for x in remove.split(",") if x.strip().isdigit()}
            confirmed_people = [p for i, p in enumerate(confirmed_people) if i not in to_remove]

        # Remove wrong projects
        print(f"\n Current projects: {', '.join(confirmed_projects) or '(none)'}")
        remove = input(
            " Numbers to REMOVE from projects (comma-separated, or enter to skip): "
        ).strip()
        if remove:
            to_remove = {int(x.strip()) - 1 for x in remove.split(",") if x.strip().isdigit()}
            confirmed_projects = [p for i, p in enumerate(confirmed_projects) if i not in to_remove]

    if choice == "add" or input("\n Add any missing? [y/N]: ").strip().lower() == "y":
        while True:
            name = input(" Name (or enter to stop): ").strip()
            if not name:
                break
            kind = input(f" Is '{name}' a (p)erson or p(r)oject? ").strip().lower()
            if kind == "p":
                confirmed_people.append(name)
            elif kind == "r":
                confirmed_projects.append(name)

    print(f"\n{'=' * 58}")
    print(" Confirmed:")
    print(f" People: {', '.join(confirmed_people) or '(none)'}")
    print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
    print(f"{'=' * 58}\n")

    return {
        "people": confirmed_people,
        "projects": confirmed_projects,
    }
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
# ==================== SCAN HELPER ====================
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
    """
    Collect prose file paths for entity detection.
    Prose only (.txt, .md, .rst, .csv) — code files produce too many false positives.
    Falls back to all readable files if no prose found.
    """
    base = Path(project_dir).expanduser().resolve()
    prose = []
    other_readable = []

    for root, dirs, filenames in os.walk(base):
        # Prune skip-dirs in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
        for fname in filenames:
            candidate = Path(root) / fname
            suffix = candidate.suffix.lower()
            if suffix in PROSE_EXTENSIONS:
                prose.append(candidate)
            elif suffix in READABLE_EXTENSIONS:
                other_readable.append(candidate)

    # Prefer prose — fall back to everything readable when prose is scarce.
    if len(prose) >= 3:
        return prose[:max_files]
    return (prose + other_readable)[:max_files]
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
# ==================== CLI ====================
|
|
839
|
+
|
|
840
|
+
if __name__ == "__main__":
    import sys

    # Expect exactly one argument: the directory to scan for entities.
    if len(sys.argv) < 2:
        print("Usage: python entity_detector.py <directory>")
        sys.exit(1)

    project_dir = sys.argv[1]
    print(f"Scanning: {project_dir}")
    # Prose-first file collection, then the two-pass detect flow.
    files = scan_for_detection(project_dir)
    print(f"Reading {len(files)} files...")
    detected = detect_entities(files)
    # Interactive review — prompts on stdin unless input is scripted.
    confirmed = confirm_entities(detected)
    print("Confirmed entities:", confirmed)
|