mempalace_code-1.0.0-py3-none-any.whl

@@ -0,0 +1,853 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ entity_detector.py — Auto-detect people and projects from file content.
4
+
5
+ Two-pass approach:
6
+ Pass 1: scan files, extract entity candidates with signal counts
7
+ Pass 2: score and classify each candidate as person, project, or uncertain
8
+
9
+ Used by mempalace init before mining begins.
10
+ The confirmed entity map feeds the miner as the taxonomy.
11
+
12
+ Usage:
13
+ from entity_detector import detect_entities, confirm_entities
14
+ candidates = detect_entities(file_paths)
15
+ confirmed = confirm_entities(candidates) # interactive review
16
+ """
17
+
18
+ import re
19
+ import os
20
+ from pathlib import Path
21
+ from collections import defaultdict
22
+
23
+
24
+ # ==================== SIGNAL PATTERNS ====================
25
+
26
+ # Person signals — things people do
27
+ PERSON_VERB_PATTERNS = [
28
+ r"\b{name}\s+said\b",
29
+ r"\b{name}\s+asked\b",
30
+ r"\b{name}\s+told\b",
31
+ r"\b{name}\s+replied\b",
32
+ r"\b{name}\s+laughed\b",
33
+ r"\b{name}\s+smiled\b",
34
+ r"\b{name}\s+cried\b",
35
+ r"\b{name}\s+felt\b",
36
+ r"\b{name}\s+thinks?\b",
37
+ r"\b{name}\s+wants?\b",
38
+ r"\b{name}\s+loves?\b",
39
+ r"\b{name}\s+hates?\b",
40
+ r"\b{name}\s+knows?\b",
41
+ r"\b{name}\s+decided\b",
42
+ r"\b{name}\s+pushed\b",
43
+ r"\b{name}\s+wrote\b",
44
+ r"\bhey\s+{name}\b",
45
+ r"\bthanks?\s+{name}\b",
46
+ r"\bhi\s+{name}\b",
47
+ r"\bdear\s+{name}\b",
48
+ ]
49
+
50
+ # Person signals — pronouns resolving nearby
51
+ PRONOUN_PATTERNS = [
52
+ r"\bshe\b",
53
+ r"\bher\b",
54
+ r"\bhers\b",
55
+ r"\bhe\b",
56
+ r"\bhim\b",
57
+ r"\bhis\b",
58
+ r"\bthey\b",
59
+ r"\bthem\b",
60
+ r"\btheir\b",
61
+ ]
62
+
63
+ # Person signals — dialogue markers
64
+ DIALOGUE_PATTERNS = [
65
+ r"^>\s*{name}[:\s]", # > Speaker: ...
66
+ r"^{name}:\s", # Speaker: ...
67
+ r"^\[{name}\]", # [Speaker]
68
+ r'"{name}\s+said',
69
+ ]
70
+
71
+ # Project signals — things projects have/do
72
+ PROJECT_VERB_PATTERNS = [
73
+ r"\bbuilding\s+{name}\b",
74
+ r"\bbuilt\s+{name}\b",
75
+ r"\bship(?:ping|ped)?\s+{name}\b",
76
+ r"\blaunch(?:ing|ed)?\s+{name}\b",
77
+ r"\bdeploy(?:ing|ed)?\s+{name}\b",
78
+ r"\binstall(?:ing|ed)?\s+{name}\b",
79
+ r"\bthe\s+{name}\s+architecture\b",
80
+ r"\bthe\s+{name}\s+pipeline\b",
81
+ r"\bthe\s+{name}\s+system\b",
82
+ r"\bthe\s+{name}\s+repo\b",
83
+ r"\b{name}\s+v\d+\b", # MemPal v2
84
+ r"\b{name}\.py\b", # mempalace.py
85
+ r"\b{name}-core\b", # mempal-core (hyphen only, not underscore)
86
+ r"\b{name}-local\b",
87
+ r"\bimport\s+{name}\b",
88
+ r"\bpip\s+install\s+{name}\b",
89
+ ]
90
+
91
+ # Words that are almost certainly NOT entities
92
+ STOPWORDS = {
93
+ "the",
94
+ "a",
95
+ "an",
96
+ "and",
97
+ "or",
98
+ "but",
99
+ "in",
100
+ "on",
101
+ "at",
102
+ "to",
103
+ "for",
104
+ "of",
105
+ "with",
106
+ "by",
107
+ "from",
108
+ "as",
109
+ "is",
110
+ "was",
111
+ "are",
112
+ "were",
113
+ "be",
114
+ "been",
115
+ "being",
116
+ "have",
117
+ "has",
118
+ "had",
119
+ "do",
120
+ "does",
121
+ "did",
122
+ "will",
123
+ "would",
124
+ "could",
125
+ "should",
126
+ "may",
127
+ "might",
128
+ "must",
129
+ "shall",
130
+ "can",
131
+ "this",
132
+ "that",
133
+ "these",
134
+ "those",
135
+ "it",
136
+ "its",
137
+ "they",
138
+ "them",
139
+ "their",
140
+ "we",
141
+ "our",
142
+ "you",
143
+ "your",
144
+ "i",
145
+ "my",
146
+ "me",
147
+ "he",
148
+ "she",
149
+ "his",
150
+ "her",
151
+ "who",
152
+ "what",
153
+ "when",
154
+ "where",
155
+ "why",
156
+ "how",
157
+ "which",
158
+ "if",
159
+ "then",
160
+ "so",
161
+ "not",
162
+ "no",
163
+ "yes",
164
+ "ok",
165
+ "okay",
166
+ "just",
167
+ "very",
168
+ "really",
169
+ "also",
170
+ "already",
171
+ "still",
172
+ "even",
173
+ "only",
174
+ "here",
175
+ "there",
176
+ "now",
178
+ "too",
179
+ "up",
180
+ "out",
181
+ "about",
182
+ "like",
183
+ "use",
184
+ "get",
185
+ "got",
186
+ "make",
187
+ "made",
188
+ "take",
189
+ "put",
190
+ "come",
191
+ "go",
192
+ "see",
193
+ "know",
194
+ "think",
195
+ "true",
196
+ "false",
197
+ "none",
198
+ "null",
199
+ "new",
200
+ "old",
201
+ "all",
202
+ "any",
203
+ "some",
206
+ "return",
207
+ "print",
208
+ "def",
209
+ "class",
210
+ "import",
212
+ # Common capitalized words in prose that aren't entities
213
+ "step",
214
+ "usage",
215
+ "run",
216
+ "check",
217
+ "find",
218
+ "add",
220
+ "set",
221
+ "list",
222
+ "args",
223
+ "dict",
224
+ "str",
225
+ "int",
226
+ "bool",
227
+ "path",
228
+ "file",
229
+ "type",
230
+ "name",
231
+ "note",
232
+ "example",
233
+ "option",
234
+ "result",
235
+ "error",
236
+ "warning",
237
+ "info",
238
+ "every",
239
+ "each",
240
+ "more",
241
+ "less",
242
+ "next",
243
+ "last",
244
+ "first",
245
+ "second",
246
+ "stack",
247
+ "layer",
248
+ "mode",
249
+ "test",
250
+ "stop",
251
+ "start",
252
+ "copy",
253
+ "move",
254
+ "source",
255
+ "target",
256
+ "output",
257
+ "input",
258
+ "data",
259
+ "item",
260
+ "key",
261
+ "value",
262
+ "returns",
263
+ "raises",
264
+ "yields",
266
+ "self",
267
+ "cls",
268
+ "kwargs",
269
+ # Common sentence-starting / abstract words that aren't entities
270
+ "world",
271
+ "well",
272
+ "want",
273
+ "topic",
274
+ "choose",
275
+ "social",
276
+ "cars",
277
+ "phones",
278
+ "healthcare",
279
+ "ex",
280
+ "machina",
281
+ "deus",
282
+ "human",
283
+ "humans",
284
+ "people",
285
+ "things",
286
+ "something",
287
+ "nothing",
288
+ "everything",
289
+ "anything",
290
+ "someone",
291
+ "everyone",
292
+ "anyone",
293
+ "way",
294
+ "time",
295
+ "day",
296
+ "life",
297
+ "place",
298
+ "thing",
299
+ "part",
300
+ "kind",
301
+ "sort",
302
+ "case",
303
+ "point",
304
+ "idea",
305
+ "fact",
306
+ "sense",
307
+ "question",
308
+ "answer",
309
+ "reason",
310
+ "number",
311
+ "version",
312
+ "system",
313
+ # Greetings and filler words at sentence starts
314
+ "hey",
315
+ "hi",
316
+ "hello",
317
+ "thanks",
318
+ "thank",
319
+ "right",
320
+ "let",
322
+ # UI/action words that appear in how-to content
323
+ "click",
324
+ "hit",
325
+ "press",
326
+ "tap",
327
+ "drag",
328
+ "drop",
329
+ "open",
330
+ "close",
331
+ "save",
332
+ "load",
333
+ "launch",
334
+ "install",
335
+ "download",
336
+ "upload",
337
+ "scroll",
338
+ "select",
339
+ "enter",
340
+ "submit",
341
+ "cancel",
342
+ "confirm",
343
+ "delete",
345
+ "paste",
347
+ "write",
348
+ "read",
349
+ "search",
351
+ "show",
352
+ "hide",
353
+ # Common filesystem/technical capitalized words
354
+ "desktop",
355
+ "documents",
356
+ "downloads",
357
+ "users",
358
+ "home",
359
+ "library",
360
+ "applications",
362
+ "preferences",
363
+ "settings",
364
+ "terminal",
365
+ # Abstract/topic words
366
+ "actor",
367
+ "vector",
368
+ "remote",
369
+ "control",
370
+ "duration",
371
+ "fetch",
372
+ # Abstract concepts that appear as subjects but aren't entities
373
+ "agents",
374
+ "tools",
375
+ "others",
376
+ "guards",
377
+ "ethics",
378
+ "regulation",
379
+ "learning",
380
+ "thinking",
381
+ "memory",
382
+ "language",
383
+ "intelligence",
384
+ "technology",
385
+ "society",
386
+ "culture",
387
+ "future",
388
+ "history",
389
+ "science",
390
+ "model",
391
+ "models",
392
+ "network",
393
+ "networks",
394
+ "training",
395
+ "inference",
396
+ }
397
+
398
+ # For entity detection — prose only, no code files
399
+ # Code files have too many capitalized names (classes, functions) that aren't entities
400
+ PROSE_EXTENSIONS = {
401
+ ".txt",
402
+ ".md",
403
+ ".rst",
404
+ ".csv",
405
+ }
406
+
407
+ READABLE_EXTENSIONS = {
408
+ ".txt",
409
+ ".md",
410
+ ".py",
411
+ ".js",
412
+ ".ts",
413
+ ".json",
414
+ ".yaml",
415
+ ".yml",
416
+ ".csv",
417
+ ".rst",
418
+ ".toml",
419
+ ".sh",
420
+ ".rb",
421
+ ".go",
422
+ ".rs",
423
+ }
424
+
425
+ SKIP_DIRS = {
426
+ ".git",
427
+ "node_modules",
428
+ "__pycache__",
429
+ ".venv",
430
+ "venv",
431
+ "env",
432
+ "dist",
433
+ "build",
434
+ ".next",
435
+ "coverage",
436
+ ".mempalace",
437
+ }
438
+
439
+
440
+ # ==================== CANDIDATE EXTRACTION ====================
441
+
442
+
443
+ def extract_candidates(text: str) -> dict:
444
+ """
445
+ Extract all capitalized proper noun candidates from text.
446
+ Returns {name: frequency} for names appearing 3+ times.
447
+ """
448
+ # Find all capitalized words; excluding sentence-start capitals is hard, so the 3+ frequency filter below handles that instead
449
+ raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)
450
+
451
+ counts = defaultdict(int)
452
+ for word in raw:
453
+ if word.lower() not in STOPWORDS and len(word) > 1:
454
+ counts[word] += 1
455
+
456
+ # Also find multi-word proper nouns (e.g. "Claude Code"); phrases containing a stopword, like "Memory Palace", are filtered below
457
+ multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
458
+ for phrase in multi:
459
+ if not any(w.lower() in STOPWORDS for w in phrase.split()):
460
+ counts[phrase] += 1
461
+
462
+ # Filter: must appear at least 3 times to be a candidate
463
+ return {name: count for name, count in counts.items() if count >= 3}
464
+
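+
+ # Usage sketch for extract_candidates. The sample text and the expected
+ # counts are illustrative assumptions, not fixtures from this package.
+ def _demo_extract_candidates() -> None:
+     sample = (
+         "Sarah pushed the fix. Sarah said it works. "
+         "Sarah asked about Mempal. Mempal ships soon. Mempal v2 is next."
+     )
+     # Both names appear 3x and pass the stopword filter, so we expect
+     # {"Sarah": 3, "Mempal": 3}.
+     print(extract_candidates(sample))
+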
465
+
466
+ # ==================== SIGNAL SCORING ====================
467
+
468
+
469
+ def _build_patterns(name: str) -> dict:
470
+ """Pre-compile all regex patterns for a single entity name."""
471
+ n = re.escape(name)
472
+ return {
473
+ "dialogue": [
474
+ re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
475
+ ],
476
+ "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
477
+ "project_verbs": [
478
+ re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
479
+ ],
480
+ "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
481
+ "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
482
+ "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
483
+ }
484
+
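+
+ # Sanity sketch for _build_patterns: the name is passed through re.escape()
+ # before being spliced into each template, so punctuation in a name cannot
+ # act as regex syntax. "C3.PO" is an assumed example name.
+ def _demo_build_patterns() -> None:
+     pats = _build_patterns("C3.PO")
+     assert pats["code_ref"].search("see C3.PO.py for details")
+     assert not pats["code_ref"].search("see C3xPO.py for details")
+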
485
+
486
+ def score_entity(name: str, text: str, lines: list) -> dict:
487
+ """
488
+ Score a candidate entity as person vs project.
489
+ Returns scores and the signals that fired.
490
+ """
491
+ patterns = _build_patterns(name)
492
+ person_score = 0
493
+ project_score = 0
494
+ person_signals = []
495
+ project_signals = []
496
+
497
+ # --- Person signals ---
498
+
499
+ # Dialogue markers (strong signal)
500
+ for rx in patterns["dialogue"]:
501
+ matches = len(rx.findall(text))
502
+ if matches > 0:
503
+ person_score += matches * 3
504
+ person_signals.append(f"dialogue marker ({matches}x)")
505
+
506
+ # Person verbs
507
+ for rx in patterns["person_verbs"]:
508
+ matches = len(rx.findall(text))
509
+ if matches > 0:
510
+ person_score += matches * 2
511
+ person_signals.append(f"'{name} ...' action ({matches}x)")
512
+
513
+ # Pronoun proximity — pronouns within 2 lines of a line mentioning the name
514
+ name_lower = name.lower()
515
+ name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
516
+ pronoun_hits = 0
517
+ for idx in name_line_indices:
518
+ window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
519
+ for pronoun_pattern in PRONOUN_PATTERNS:
520
+ if re.search(pronoun_pattern, window_text):
521
+ pronoun_hits += 1
522
+ break
523
+ if pronoun_hits > 0:
524
+ person_score += pronoun_hits * 2
525
+ person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
526
+
527
+ # Direct address
528
+ direct = len(patterns["direct"].findall(text))
529
+ if direct > 0:
530
+ person_score += direct * 4
531
+ person_signals.append(f"addressed directly ({direct}x)")
532
+
533
+ # --- Project signals ---
534
+
535
+ for rx in patterns["project_verbs"]:
536
+ matches = len(rx.findall(text))
537
+ if matches > 0:
538
+ project_score += matches * 2
539
+ project_signals.append(f"project verb ({matches}x)")
540
+
541
+ versioned = len(patterns["versioned"].findall(text))
542
+ if versioned > 0:
543
+ project_score += versioned * 3
544
+ project_signals.append(f"versioned/hyphenated ({versioned}x)")
545
+
546
+ code_ref = len(patterns["code_ref"].findall(text))
547
+ if code_ref > 0:
548
+ project_score += code_ref * 3
549
+ project_signals.append(f"code file reference ({code_ref}x)")
550
+
551
+ return {
552
+ "person_score": person_score,
553
+ "project_score": project_score,
554
+ "person_signals": person_signals[:3],
555
+ "project_signals": project_signals[:3],
556
+ }
557
+
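+
+ # Scoring sketch on a tiny assumed snippet. Under the weights above we
+ # expect: dialogue marker 1x (+3), "Sarah said" 1x (+2), and a pronoun in
+ # the two-line window of both name lines (2x, +4) -> person_score 9.
+ def _demo_score_entity() -> None:
+     sample = "Sarah: ready to ship.\nSarah said she fixed it.\n"
+     result = score_entity("Sarah", sample, sample.splitlines())
+     print(result["person_score"], result["project_score"])  # 9 0
+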
558
+
559
+ # ==================== CLASSIFY ====================
560
+
561
+
562
+ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
563
+ """
564
+ Given scores, classify as person / project / uncertain.
565
+ Returns entity dict with confidence.
566
+ """
567
+ ps = scores["person_score"]
568
+ prs = scores["project_score"]
569
+ total = ps + prs
570
+
571
+ if total == 0:
572
+ # No strong signals — frequency-only candidate, uncertain
573
+ confidence = min(0.4, frequency / 50)
574
+ return {
575
+ "name": name,
576
+ "type": "uncertain",
577
+ "confidence": round(confidence, 2),
578
+ "frequency": frequency,
579
+ "signals": [f"appears {frequency}x, no strong type signals"],
580
+ }
581
+
582
+ person_ratio = ps / total if total > 0 else 0
583
+
584
+ # Require TWO different signal categories to confidently classify as a person.
585
+ # One signal type with many hits (e.g. "Click, click, click...") is not enough —
586
+ # it just means that word appears often in a particular syntactic position.
587
+ signal_categories = set()
588
+ for s in scores["person_signals"]:
589
+ if "dialogue" in s:
590
+ signal_categories.add("dialogue")
591
+ elif "action" in s:
592
+ signal_categories.add("action")
593
+ elif "pronoun" in s:
594
+ signal_categories.add("pronoun")
595
+ elif "addressed" in s:
596
+ signal_categories.add("addressed")
597
+
598
+ has_two_signal_types = len(signal_categories) >= 2
599
+ _ = signal_categories - {"pronoun"} # reserved for future thresholds
600
+
601
+ if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
602
+ entity_type = "person"
603
+ confidence = min(0.99, 0.5 + person_ratio * 0.5)
604
+ signals = scores["person_signals"] or [f"appears {frequency}x"]
605
+ elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
606
+ # Weak person match (single signal category or low score) — downgrade to uncertain
607
+ entity_type = "uncertain"
608
+ confidence = 0.4
609
+ signals = scores["person_signals"] + [f"appears {frequency}x — weak person signals"]
610
+ elif person_ratio <= 0.3:
611
+ entity_type = "project"
612
+ confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
613
+ signals = scores["project_signals"] or [f"appears {frequency}x"]
614
+ else:
615
+ entity_type = "uncertain"
616
+ confidence = 0.5
617
+ signals = (scores["person_signals"] + scores["project_signals"])[:3]
618
+ signals.append("mixed signals — needs review")
619
+
620
+ return {
621
+ "name": name,
622
+ "type": entity_type,
623
+ "confidence": round(confidence, 2),
624
+ "frequency": frequency,
625
+ "signals": signals,
626
+ }
627
+
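+
+ # Classification sketch. The scores dict is assumed, shaped like
+ # score_entity() output: two distinct person signal categories with
+ # person_ratio >= 0.7 and person_score >= 5 should yield "person".
+ def _demo_classify_entity() -> None:
+     scores = {
+         "person_score": 9,
+         "project_score": 0,
+         "person_signals": ["dialogue marker (1x)", "'Sarah ...' action (1x)"],
+         "project_signals": [],
+     }
+     entity = classify_entity("Sarah", frequency=4, scores=scores)
+     print(entity["type"], entity["confidence"])  # person 0.99
+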
628
+
629
+ # ==================== MAIN DETECT ====================
630
+
631
+
632
+ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
633
+ """
634
+ Scan files and detect entity candidates.
635
+
636
+ Args:
637
+ file_paths: List of Path objects to scan
638
+ max_files: Max files to read (for speed)
639
+
640
+ Returns:
641
+ {
642
+ "people": [...entity dicts...],
643
+ "projects": [...entity dicts...],
644
+ "uncertain":[...entity dicts...],
645
+ }
646
+ """
647
+ # Collect text from files
648
+ all_text = []
649
+ all_lines = []
650
+ files_read = 0
651
+
652
+ MAX_CHARS_PER_FILE = 5_000 # first ~5K characters per file (text-mode read) — enough to catch recurring entities
653
+
654
+ for filepath in file_paths:
655
+ if files_read >= max_files:
656
+ break
657
+ try:
658
+ with open(filepath, encoding="utf-8", errors="replace") as f:
659
+ content = f.read(MAX_CHARS_PER_FILE)
660
+ all_text.append(content)
661
+ all_lines.extend(content.splitlines())
662
+ files_read += 1
663
+ except OSError:
664
+ continue
665
+
666
+ combined_text = "\n".join(all_text)
667
+
668
+ # Extract candidates
669
+ candidates = extract_candidates(combined_text)
670
+
671
+ if not candidates:
672
+ return {"people": [], "projects": [], "uncertain": []}
673
+
674
+ # Score and classify each candidate
675
+ people = []
676
+ projects = []
677
+ uncertain = []
678
+
679
+ for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
680
+ scores = score_entity(name, combined_text, all_lines)
681
+ entity = classify_entity(name, frequency, scores)
682
+
683
+ if entity["type"] == "person":
684
+ people.append(entity)
685
+ elif entity["type"] == "project":
686
+ projects.append(entity)
687
+ else:
688
+ uncertain.append(entity)
689
+
690
+ # Sort by confidence descending
691
+ people.sort(key=lambda x: x["confidence"], reverse=True)
692
+ projects.sort(key=lambda x: x["confidence"], reverse=True)
693
+ uncertain.sort(key=lambda x: x["frequency"], reverse=True)
694
+
695
+ # Cap results to most relevant
696
+ return {
697
+ "people": people[:15],
698
+ "projects": projects[:10],
699
+ "uncertain": uncertain[:8],
700
+ }
701
+
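+
+ # End-to-end sketch: write one assumed prose file into a temp directory and
+ # run detection over it. File name and contents are illustrative only.
+ def _demo_detect_entities() -> None:
+     import tempfile
+
+     with tempfile.TemporaryDirectory() as tmp:
+         note = Path(tmp) / "notes.md"
+         note.write_text(
+             "Sarah: shipping the fix today.\n"
+             "Sarah said she rebuilt the pipeline.\n"
+             "Sarah asked about the next release.\n",
+             encoding="utf-8",
+         )
+         detected = detect_entities([note])
+         print([e["name"] for e in detected["people"]])  # ["Sarah"]
+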
702
+
703
+ # ==================== INTERACTIVE CONFIRM ====================
704
+
705
+
706
+ def _print_entity_list(entities: list, label: str):
707
+ print(f"\n {label}:")
708
+ if not entities:
709
+ print(" (none detected)")
710
+ return
711
+ for i, e in enumerate(entities):
712
+ confidence_bar = "●" * int(e["confidence"] * 5) + "○" * (5 - int(e["confidence"] * 5))
713
+ signals_str = ", ".join(e["signals"][:2]) if e["signals"] else ""
714
+ print(f" {i + 1:2}. {e['name']:20} [{confidence_bar}] {signals_str}")
715
+
716
+
717
+ def confirm_entities(detected: dict, yes: bool = False) -> dict:
718
+ """
719
+ Interactive confirmation step.
720
+ User reviews detected entities, removes wrong ones, adds missing ones.
721
+ Returns confirmed {people: [names], projects: [names]}
722
+
723
+ Pass yes=True to auto-accept all detected entities without prompting.
724
+ """
725
+ print(f"\n{'=' * 58}")
726
+ print(" MemPalace — Entity Detection")
727
+ print(f"{'=' * 58}")
728
+ print("\n Scanned your files. Here's what we found:\n")
729
+
730
+ _print_entity_list(detected["people"], "PEOPLE")
731
+ _print_entity_list(detected["projects"], "PROJECTS")
732
+
733
+ if detected["uncertain"]:
734
+ _print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")
735
+
736
+ confirmed_people = [e["name"] for e in detected["people"]]
737
+ confirmed_projects = [e["name"] for e in detected["projects"]]
738
+
739
+ if yes:
740
+ # Auto-accept: include all detected (skip uncertain — ambiguous without user input)
741
+ print(
742
+ f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
743
+ )
744
+ return {"people": confirmed_people, "projects": confirmed_projects}
745
+
746
+ print(f"\n{'─' * 58}")
747
+ print(" Options:")
748
+ print(" [enter] Accept all")
749
+ print(" [edit] Remove wrong entries or reclassify uncertain")
750
+ print(" [add] Add missing people or projects")
751
+ print()
752
+
753
+ choice = input(" Your choice [enter/edit/add]: ").strip().lower()
754
+
757
+
758
+ if choice == "edit":
759
+ # Handle uncertain first
760
+ if detected["uncertain"]:
761
+ print("\n Uncertain entities — classify each:")
762
+ for e in detected["uncertain"]:
763
+ ans = input(f" {e['name']} — (p)erson, p(r)oject, or (s)kip? ").strip().lower()
764
+ if ans == "p":
765
+ confirmed_people.append(e["name"])
766
+ elif ans == "r":
767
+ confirmed_projects.append(e["name"])
768
+
769
+ # Remove wrong people
770
+ print(f"\n Current people: {', '.join(confirmed_people) or '(none)'}")
771
+ remove = input(
772
+ " Numbers to REMOVE from people (comma-separated, or enter to skip): "
773
+ ).strip()
774
+ if remove:
775
+ to_remove = {int(x.strip()) - 1 for x in remove.split(",") if x.strip().isdigit()}
776
+ confirmed_people = [p for i, p in enumerate(confirmed_people) if i not in to_remove]
777
+
778
+ # Remove wrong projects
779
+ print(f"\n Current projects: {', '.join(confirmed_projects) or '(none)'}")
780
+ remove = input(
781
+ " Numbers to REMOVE from projects (comma-separated, or enter to skip): "
782
+ ).strip()
783
+ if remove:
784
+ to_remove = {int(x.strip()) - 1 for x in remove.split(",") if x.strip().isdigit()}
785
+ confirmed_projects = [p for i, p in enumerate(confirmed_projects) if i not in to_remove]
786
+
787
+ if choice == "add" or input("\n Add any missing? [y/N]: ").strip().lower() == "y":
788
+ while True:
789
+ name = input(" Name (or enter to stop): ").strip()
790
+ if not name:
791
+ break
792
+ kind = input(f" Is '{name}' a (p)erson or p(r)oject? ").strip().lower()
793
+ if kind == "p":
794
+ confirmed_people.append(name)
795
+ elif kind == "r":
796
+ confirmed_projects.append(name)
797
+
798
+ print(f"\n{'=' * 58}")
799
+ print(" Confirmed:")
800
+ print(f" People: {', '.join(confirmed_people) or '(none)'}")
801
+ print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
802
+ print(f"{'=' * 58}\n")
803
+
804
+ return {
805
+ "people": confirmed_people,
806
+ "projects": confirmed_projects,
807
+ }
808
+
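+
+ # Non-interactive sketch: with yes=True, confirm_entities prints its summary
+ # and returns the detected names without prompting. The dict below is an
+ # assumed stand-in for detect_entities() output.
+ def _demo_confirm_entities() -> None:
+     detected = {
+         "people": [
+             {"name": "Sarah", "type": "person", "confidence": 0.9,
+              "frequency": 4, "signals": ["dialogue marker (2x)"]}
+         ],
+         "projects": [],
+         "uncertain": [],
+     }
+     print(confirm_entities(detected, yes=True))
+     # -> {"people": ["Sarah"], "projects": []}
+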
809
+
810
+ # ==================== SCAN HELPER ====================
811
+
812
+
813
+ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
814
+ """
815
+ Collect prose file paths for entity detection.
816
+ Prose only (.txt, .md, .rst, .csv) — code files produce too many false positives.
817
+ Falls back to prose plus all readable files when fewer than three prose files are found.
818
+ """
819
+ project_path = Path(project_dir).expanduser().resolve()
820
+ prose_files = []
821
+ all_files = []
822
+
823
+ for root, dirs, filenames in os.walk(project_path):
824
+ dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
825
+ for filename in filenames:
826
+ filepath = Path(root) / filename
827
+ ext = filepath.suffix.lower()
828
+ if ext in PROSE_EXTENSIONS:
829
+ prose_files.append(filepath)
830
+ elif ext in READABLE_EXTENSIONS:
831
+ all_files.append(filepath)
832
+
833
+ # Prefer prose files — fall back to all readable if too few prose files
834
+ files = prose_files if len(prose_files) >= 3 else prose_files + all_files
835
+ return files[:max_files]
836
+
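+
+ # Usage sketch for scan_for_detection; "." is an assumed example directory.
+ # Prose files win when at least three exist, otherwise readable code files
+ # pad out the list, and SKIP_DIRS are pruned during the walk.
+ def _demo_scan_for_detection() -> None:
+     for filepath in scan_for_detection(".", max_files=5):
+         print(filepath)
+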
837
+
838
+ # ==================== CLI ====================
839
+
840
+ if __name__ == "__main__":
841
+ import sys
842
+
843
+ if len(sys.argv) < 2:
844
+ print("Usage: python entity_detector.py <directory>")
845
+ sys.exit(1)
846
+
847
+ project_dir = sys.argv[1]
848
+ print(f"Scanning: {project_dir}")
849
+ files = scan_for_detection(project_dir)
850
+ print(f"Reading {len(files)} files...")
851
+ detected = detect_entities(files)
852
+ confirmed = confirm_entities(detected)
853
+ print("Confirmed entities:", confirmed)