memorytrace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. engram/__init__.py +8 -0
  2. engram/__main__.py +6 -0
  3. engram/cli/__init__.py +1 -0
  4. engram/cli/app.py +291 -0
  5. engram/cli/formatters.py +90 -0
  6. engram/cli/simple.py +267 -0
  7. engram/config.py +72 -0
  8. engram/engine.py +612 -0
  9. engram/exceptions.py +41 -0
  10. engram/extraction/__init__.py +6 -0
  11. engram/extraction/base.py +20 -0
  12. engram/extraction/llm_extractor.py +197 -0
  13. engram/extraction/ner/__init__.py +7 -0
  14. engram/extraction/ner/cjk.py +63 -0
  15. engram/extraction/ner/english.py +109 -0
  16. engram/extraction/ner/korean.py +106 -0
  17. engram/extraction/regex_extractor.py +188 -0
  18. engram/integrations/__init__.py +1 -0
  19. engram/integrations/mcp_server.py +213 -0
  20. engram/integrations/sdk.py +194 -0
  21. engram/models/__init__.py +19 -0
  22. engram/models/entity.py +72 -0
  23. engram/models/fact.py +58 -0
  24. engram/models/quality.py +61 -0
  25. engram/models/relation.py +26 -0
  26. engram/models/search.py +96 -0
  27. engram/models/session.py +53 -0
  28. engram/models/source.py +73 -0
  29. engram/quality/__init__.py +8 -0
  30. engram/quality/confidence.py +38 -0
  31. engram/quality/conflict.py +79 -0
  32. engram/quality/decay.py +28 -0
  33. engram/quality/gate.py +120 -0
  34. engram/quality/pii.py +80 -0
  35. engram/search/__init__.py +13 -0
  36. engram/search/base.py +20 -0
  37. engram/search/fts5_search.py +210 -0
  38. engram/search/hybrid.py +99 -0
  39. engram/search/semantic.py +186 -0
  40. engram/search/tokenizer.py +85 -0
  41. engram/session/__init__.py +6 -0
  42. engram/session/context.py +87 -0
  43. engram/session/manager.py +152 -0
  44. engram/session/working_memory.py +57 -0
  45. engram/storage/__init__.py +6 -0
  46. engram/storage/base.py +63 -0
  47. engram/storage/markdown_export.py +144 -0
  48. engram/storage/migrations.py +30 -0
  49. engram/storage/sqlite_store.py +615 -0
  50. memorytrace-0.1.0.dist-info/METADATA +138 -0
  51. memorytrace-0.1.0.dist-info/RECORD +54 -0
  52. memorytrace-0.1.0.dist-info/WHEEL +4 -0
  53. memorytrace-0.1.0.dist-info/entry_points.txt +3 -0
  54. memorytrace-0.1.0.dist-info/licenses/LICENSE +21 -0
engram/exceptions.py ADDED
@@ -0,0 +1,41 @@
1
+ """Engram exception hierarchy."""
2
+
3
+
4
class EngramError(Exception):
    """Root of the Engram exception hierarchy.

    Every other exception in this module derives from it, so callers can
    catch ``EngramError`` to handle any Engram-specific failure.
    """


class EntityNotFoundError(EngramError):
    """An entity lookup did not succeed."""


class EntityAlreadyExistsError(EngramError):
    """An entity was created although it already exists."""


class StorageError(EngramError):
    """The storage backend failed (I/O, corruption, etc.)."""


class ValidationError(EngramError):
    """Data did not pass quality gate validation."""


class ConflictError(EngramError):
    """A fact conflicts with data that already exists."""


class SessionError(EngramError):
    """A session lifecycle error occurred."""


class ConfigError(EngramError):
    """A configuration error occurred."""


class ExtractionError(EngramError):
    """Entity/fact extraction failed."""


class SecurityError(EngramError):
    """A security violation occurred (path traversal, etc.)."""
@@ -0,0 +1,6 @@
1
+ """Extraction module for Engram."""
2
+
3
+ from engram.extraction.base import Extractor
4
+ from engram.extraction.regex_extractor import RegexExtractor
5
+
6
+ __all__ = ["Extractor", "RegexExtractor"]
@@ -0,0 +1,20 @@
1
+ """Extractor protocol — the contract all extractors implement."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from engram.models.entity import Entity
8
+ from engram.models.fact import Fact
9
+ from engram.models.relation import Relation
10
+
11
+
12
@runtime_checkable
class Extractor(Protocol):
    """Structural interface shared by all extractors.

    The protocol is decorated with ``runtime_checkable`` so that
    ``isinstance(obj, Extractor)`` can be used on candidate objects.
    """

    def extract_entities(self, text: str) -> list[Entity]:
        """Return the entities found in ``text``."""

    def extract_facts(self, text: str, entities: list[Entity]) -> list[Fact]:
        """Return facts found in ``text`` for the given ``entities``."""

    def extract_relations(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Return relations found in ``text`` between the given ``entities``."""
@@ -0,0 +1,197 @@
1
+ """LLM-based extractor — high-quality extraction using Claude or GPT.
2
+
3
+ Requires: pip install engram[llm]
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ from typing import Optional
10
+
11
+ from engram.models.entity import Entity
12
+ from engram.models.fact import Fact
13
+ from engram.models.relation import Relation
14
+
15
# Entity types accepted from the model. LLMExtractor.extract_entities replaces
# any other value with "person".
_VALID_ENTITY_TYPES = frozenset({"person", "organization", "project", "concept"})
# Upper bound on the length of an extracted entity name; longer names are cut
# to this many characters.
_MAX_NAME_LENGTH = 100

# Prompt templates, filled with str.format(). ``{text}`` receives the input
# text and ``{entities}`` a comma-separated list of known entity names. Each
# prompt asks the model to answer with a bare JSON array.

# Entity extraction: expected item fields are ``name`` and ``entity_type``.
_ENTITY_PROMPT = """Extract named entities from the following text.
Return a JSON array of objects with fields: name, entity_type (person/organization/project/concept).
Only extract clearly identifiable entities. Do not guess.

Text:
{text}

Return ONLY the JSON array, no other text."""

# Fact extraction: expected item fields are ``subject``, ``predicate``,
# ``object``, ``raw_text`` and ``confidence``.
_FACT_PROMPT = """Given the following text and known entities, extract factual statements.
Return a JSON array of objects with fields: subject, predicate, object, raw_text, confidence (0.0-1.0).

Known entities: {entities}

Text:
{text}

Return ONLY the JSON array, no other text."""

# Relation extraction: expected item fields are ``from_name``, ``to_name`` and
# ``relation_type``.
_RELATION_PROMPT = """Given the following text and known entities, extract relationships between entities.
Return a JSON array of objects with fields: from_name, to_name, relation_type (e.g., CEO_OF, WORKS_AT, INVESTED_IN).

Known entities: {entities}

Text:
{text}

Return ONLY the JSON array, no other text."""
46
+
47
+
48
class LLMExtractor:
    """LLM-based entity, fact, and relation extraction.

    Sends a prompt template to Claude (Anthropic) or GPT (OpenAI) and parses
    the JSON array in the reply. Extraction is best-effort: each ``extract_*``
    method returns an empty list when the API call or the parsing fails, and
    the failure is logged at DEBUG level. A single malformed item in the reply
    (non-object element, ``null`` field, non-numeric confidence) is skipped or
    defaulted instead of discarding the whole reply.

    Args:
        provider: ``"anthropic"`` or ``"openai"``.
        model: Model identifier; a per-provider default is used when omitted.

    Raises:
        ImportError: The SDK for the selected provider is not installed.
        ValueError: ``provider`` is not a supported value.
    """

    def __init__(
        self,
        provider: str = "anthropic",
        model: Optional[str] = None,
    ):
        self.provider = provider
        self._client = None

        # NOTE(review): the install hints below say ``engram[llm]`` while the
        # wheel is published as ``memorytrace`` — confirm the extra's name.
        if provider == "anthropic":
            self.model = model or "claude-sonnet-4-6"
            # Only the import is guarded; client construction errors propagate.
            try:
                from anthropic import Anthropic
            except ImportError as exc:
                raise ImportError(
                    "Anthropic SDK not installed. Run: pip install engram[llm]"
                ) from exc
            self._client = Anthropic()
        elif provider == "openai":
            self.model = model or "gpt-4o-mini"
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise ImportError(
                    "OpenAI SDK not installed. Run: pip install engram[llm]"
                ) from exc
            self._client = OpenAI()
        else:
            raise ValueError(f"Unknown provider: {provider}. Use 'anthropic' or 'openai'.")

    def _call_llm(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text."""
        if self.provider == "anthropic":
            response = self._client.messages.create(
                model=self.model,
                max_tokens=2048,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.content[0].text

        # __init__ only admits "anthropic" or "openai", so this is the OpenAI path.
        response = self._client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=2048,
        )
        return response.choices[0].message.content or ""

    def _parse_json_response(self, text: str) -> list:
        """Parse a JSON array from an LLM reply, tolerating a markdown code fence.

        Returns:
            The decoded list, or ``[]`` when the reply is not valid JSON or the
            top-level value is not a list.
        """
        text = text.strip() if isinstance(text, str) else ""
        if text.startswith("```"):
            # Drop the opening fence line and, if present, the closing one.
            body = text.split("\n")[1:]
            if body and body[-1].strip().startswith("```"):
                body = body[:-1]
            text = "\n".join(body)
        try:
            result = json.loads(text)
        except json.JSONDecodeError:
            return []
        return result if isinstance(result, list) else []

    @staticmethod
    def _records(items: list) -> list:
        """Keep only the JSON objects of ``items``; other element types are dropped."""
        return [item for item in items if isinstance(item, dict)]

    @staticmethod
    def _as_text(value: object, default: str = "") -> str:
        """Normalize a JSON field to a string.

        Strings are stripped (an empty result yields ``default``); ``null``,
        objects and arrays yield ``default``; other scalars are stringified.
        """
        if isinstance(value, str):
            return value.strip() or default
        if value is None or isinstance(value, (dict, list)):
            return default
        return str(value)

    @staticmethod
    def _as_confidence(value: object, default: float = 0.7) -> float:
        """Clamp a model-supplied confidence into ``[0.0, 1.0]``.

        Values that cannot be read as a number (including booleans and
        ``null``) fall back to ``default``.
        """
        if isinstance(value, bool):
            return default
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        return min(1.0, max(0.0, number))

    @staticmethod
    def _log_failure(stage: str) -> None:
        """Log the exception currently being handled at DEBUG level."""
        import logging  # local import keeps the module's import block untouched

        logging.getLogger(__name__).debug(
            "LLM %s extraction failed; returning no results", stage, exc_info=True
        )

    def extract_entities(self, text: str) -> list[Entity]:
        """Extract entities from ``text`` using the LLM.

        Only the first 4000 characters of ``text`` are sent. Names are
        de-duplicated case-insensitively, truncated to ``_MAX_NAME_LENGTH``,
        and unknown entity types are coerced to ``"person"``.

        Returns:
            The extracted entities, or ``[]`` for blank input or on failure.
        """
        if not text.strip():
            return []
        try:
            prompt = _ENTITY_PROMPT.format(text=text[:4000])
            items = self._parse_json_response(self._call_llm(prompt))
            entities = []
            seen = set()
            for item in self._records(items):
                # Truncate before the duplicate check so the check sees the
                # name that is actually stored.
                name = self._as_text(item.get("name"))[:_MAX_NAME_LENGTH]
                key = name.lower()
                if not name or key in seen:
                    continue
                entity_type = self._as_text(item.get("entity_type"), "person").lower()
                if entity_type not in _VALID_ENTITY_TYPES:
                    entity_type = "person"
                seen.add(key)
                entities.append(Entity(
                    name=name,
                    entity_type=entity_type,
                ))
            return entities
        except Exception:
            # Deliberate best-effort: extraction must not break the caller.
            self._log_failure("entity")
            return []

    def extract_facts(self, text: str, entities: list[Entity]) -> list[Fact]:
        """Extract facts about ``entities`` from ``text`` using the LLM.

        A fact is kept only when its ``subject`` matches one of the given
        entity names (case-insensitive).

        Returns:
            The extracted facts, or ``[]`` for blank input, no entities, or on
            failure.
        """
        if not text.strip() or not entities:
            return []
        try:
            entity_names = ", ".join(e.name for e in entities)
            entity_map = {e.name.lower(): e for e in entities}
            prompt = _FACT_PROMPT.format(text=text[:4000], entities=entity_names)
            items = self._parse_json_response(self._call_llm(prompt))
            facts = []
            for item in self._records(items):
                entity = entity_map.get(self._as_text(item.get("subject")).lower())
                if not entity:
                    continue
                facts.append(Fact(
                    entity_id=entity.id,
                    subject=entity.name,
                    predicate=self._as_text(item.get("predicate"), "attribute"),
                    object=self._as_text(item.get("object")),
                    raw_text=self._as_text(item.get("raw_text")),
                    confidence=self._as_confidence(item.get("confidence")),
                ))
            return facts
        except Exception:
            # Deliberate best-effort: extraction must not break the caller.
            self._log_failure("fact")
            return []

    def extract_relations(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Extract relations between ``entities`` from ``text`` using the LLM.

        A relation is kept only when both endpoint names match given entities
        (case-insensitive).

        Returns:
            The extracted relations, or ``[]`` for blank input, fewer than two
            entities, or on failure.
        """
        if not text.strip() or len(entities) < 2:
            return []
        try:
            entity_names = ", ".join(e.name for e in entities)
            entity_map = {e.name.lower(): e for e in entities}
            prompt = _RELATION_PROMPT.format(text=text[:4000], entities=entity_names)
            items = self._parse_json_response(self._call_llm(prompt))
            relations = []
            for item in self._records(items):
                from_entity = entity_map.get(self._as_text(item.get("from_name")).lower())
                to_entity = entity_map.get(self._as_text(item.get("to_name")).lower())
                if from_entity and to_entity:
                    relations.append(Relation(
                        from_entity_id=from_entity.id,
                        to_entity_id=to_entity.id,
                        relation_type=self._as_text(item.get("relation_type"), "RELATED_TO"),
                    ))
            return relations
        except Exception:
            # Deliberate best-effort: extraction must not break the caller.
            self._log_failure("relation")
            return []
@@ -0,0 +1,7 @@
1
+ """Named Entity Recognition modules."""
2
+
3
+ from engram.extraction.ner.english import extract_english_entities
4
+ from engram.extraction.ner.korean import extract_korean_entities
5
+ from engram.extraction.ner.cjk import extract_cjk_entities
6
+
7
+ __all__ = ["extract_english_entities", "extract_korean_entities", "extract_cjk_entities"]
@@ -0,0 +1,63 @@
1
+ """CJK (Chinese/Japanese) named entity recognition — basic fallback."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from engram.models.entity import Entity
8
+
9
# Common single-character Chinese surnames (50 entries). Used by
# extract_cjk_entities to decide whether a candidate starts with a surname.
_CHINESE_SURNAMES = frozenset({
    "王", "李", "张", "刘", "陈", "杨", "黄", "赵", "吴", "周",
    "徐", "孙", "马", "朱", "胡", "郭", "林", "何", "高", "罗",
    "郑", "梁", "谢", "宋", "唐", "韩", "曹", "许", "邓", "萧",
    "冯", "曾", "程", "蔡", "彭", "潘", "袁", "于", "董", "余",
    "苏", "叶", "吕", "魏", "蒋", "田", "杜", "丁", "沈", "姜",
})

# Common Japanese surnames (48 entries, 1-3 characters each). "林" also appears
# in the Chinese list above.
_JAPANESE_SURNAMES = frozenset({
    "佐藤", "鈴木", "高橋", "田中", "伊藤", "渡辺", "山本", "中村",
    "小林", "加藤", "吉田", "山田", "佐々木", "松本", "井上", "木村",
    "林", "斎藤", "清水", "山崎", "池田", "橋本", "阿部", "石川",
    "山下", "中島", "前田", "藤田", "小川", "後藤", "岡田", "長谷川",
    "村上", "近藤", "石井", "遠藤", "坂本", "青木", "藤井", "西村",
    "福田", "太田", "三浦", "藤原", "松田", "岡本", "中川", "中野",
})

# Chinese name: surname(1) + given(1-2) = 2-3 chars.
# The pattern itself accepts ANY ideograph in U+4E00-U+9FFF as group 1; the
# surname dictionary check is done by the caller.
_CN_NAME_PATTERN = re.compile(r'([\u4E00-\u9FFF])([\u4E00-\u9FFF]{1,2})')

# Japanese name: surname(1-3 chars) + given(1-3 chars).
# Group 1 is an alternation of the escaped dictionary surnames; group 2 is
# 1-3 characters from the hiragana, katakana or U+4E00-U+9FFF ranges.
# The alternation order follows frozenset iteration order; no surname in the
# set is a prefix of another, so the order does not change what matches.
_JP_SURNAME_PATTERN = re.compile(
    '(' + '|'.join(re.escape(s) for s in _JAPANESE_SURNAMES) + r')([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]{1,3})'
)
35
+
36
+
37
def extract_cjk_entities(text: str) -> list[Entity]:
    """Extract Chinese and Japanese person names from ``text``.

    Basic dictionary approach:
    - Chinese: a dictionary surname (1 char) followed by a given name of
      1-2 ideographs.
    - Japanese: a dictionary surname (1-3 chars) followed by a given name of
      1-3 kana/kanji characters.

    The Chinese scan tests every character position for a surname. Iterating
    ``_CN_NAME_PATTERN.finditer`` directly would consume ideograph runs in
    non-overlapping 2-3 character chunks and only test the first character of
    each chunk, so a name such as "王小明" inside "我是王小明" would be missed.

    Returns:
        Unique names in order of first appearance, all typed ``"person"``;
        Chinese matches come before Japanese matches.
    """
    entities: list[Entity] = []
    seen: set[str] = set()

    def _add(name: str) -> None:
        # Exact-string de-duplication shared by both passes.
        if name not in seen:
            seen.add(name)
            entities.append(Entity(name=name, entity_type="person"))

    # Chinese names: anchor the pattern at each surname character.
    pos = 0
    end = len(text)
    while pos < end:
        match = None
        if text[pos] in _CHINESE_SURNAMES:
            match = _CN_NAME_PATTERN.match(text, pos)
        if match is None:
            pos += 1
            continue
        _add(match.group(0))
        # Resume after the name so its given-name characters are not
        # re-read as the surname of another candidate.
        pos = match.end()

    # Japanese names: the surname dictionary is built into the pattern.
    for match in _JP_SURNAME_PATTERN.finditer(text):
        _add(match.group(0))

    return entities
@@ -0,0 +1,109 @@
1
+ """English named entity recognition — context-aware pattern matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from engram.models.entity import Entity
8
+
9
# Titles that precede names (lower-cased).
# NOTE(review): this set is not referenced anywhere else in this module; the
# titles actually recognized are the ones hard-coded in _TITLED_NAME below.
_TITLES = frozenset({
    "mr", "mrs", "ms", "dr", "prof", "professor", "sir", "lady",
    "president", "ceo", "cto", "cfo", "coo", "vp", "director",
})

# Lower-cased phrases that look like names but aren't (expanded blocklist).
# Candidates are compared against it after lower-casing.
_BLOCKLIST = frozenset({
    # Common section headers / phrases, followed by place names
    "key points", "executive summary", "see also", "last updated",
    "time line", "timeline", "action items", "next steps", "open loops",
    "north america", "south america", "south korea", "north korea",
    "new york", "los angeles", "san francisco", "san diego", "san jose",
    "las vegas", "el paso", "hong kong", "sri lanka", "costa rica",
    "buenos aires", "rio grande", "cape town", "new zealand",
    "united states", "united kingdom", "saudi arabia", "south africa",
    # Tech / product terms
    "machine learning", "deep learning", "artificial intelligence",
    "open source", "real time", "high quality", "best practice",
    "cloud computing", "data science", "big data",
    # Common false positives
    "thank you", "good morning", "good afternoon", "good evening",
    "happy new", "merry christmas", "looking forward",
})

# Organization suffixes. Matching is case-insensitive, so lower-case words
# such as "group" or "capital" in running text also match.
_ORG_SUFFIXES = re.compile(
    r'\b(Inc\.?|Corp\.?|Ltd\.?|LLC|GmbH|Co\.|Foundation|Association'
    r'|Institute|University|Partners|Capital|Ventures|Labs?|Group'
    r'|Holdings|Technologies|Solutions|Networks|Studios?)\b',
    re.IGNORECASE,
)

# Pattern: Two or three consecutive capitalized words, each one capital letter
# followed by 1-15 lower-case letters.
# Uses letter lookarounds instead of \b for CJK-mixed text compatibility.
_NAME_PATTERN = re.compile(
    r'(?<![a-zA-Z])([A-Z][a-z]{1,15}(?:\s+[A-Z][a-z]{1,15}){1,2})(?![a-zA-Z])'
)

# Pattern: Title + Name. Group 1 is the name without the title.
# NOTE(review): the literal period after the title is mandatory, so
# "Dr. Jane Doe" matches but "CEO Jane Doe" and "Mr Jane Doe" do not.
_TITLED_NAME = re.compile(
    r'\b(?:Mr|Mrs|Ms|Dr|Prof|CEO|CTO)\.\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b'
)
47
+
48
+ # Pattern: Title + Name
49
+ _TITLED_NAME = re.compile(
50
+ r'\b(?:Mr|Mrs|Ms|Dr|Prof|CEO|CTO)\.\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b'
51
+ )
52
+
53
+
54
def extract_english_entities(text: str) -> list[Entity]:
    """Extract English named entities from ``text``.

    Three passes, each skipping names (compared lower-cased) that an earlier
    pass already produced:

    1. Titled names matched by ``_TITLED_NAME`` → person.
    2. Capitalized words directly followed by an organization suffix
       (Inc, Corp, Foundation, ...) → organization.
    3. Remaining runs of two or three capitalized words → person.

    Passes 1 and 3 drop candidates listed in ``_BLOCKLIST``.

    Returns:
        Entities in the order they were found, pass by pass.
    """
    entities: list[Entity] = []
    seen_names: set[str] = set()

    # Pass 1: Titled names (high confidence)
    for m in _TITLED_NAME.finditer(text):
        name = m.group(1).strip()
        normalized = name.lower()
        if normalized not in seen_names and normalized not in _BLOCKLIST:
            seen_names.add(normalized)
            entities.append(Entity(name=name, entity_type="person"))

    # Pass 2: Organizations (suffix-based)
    for m in _ORG_SUFFIXES.finditer(text):
        # Look backwards (up to 60 characters) for the organization name.
        start = max(0, m.start() - 60)
        window = text[start:m.end()]
        # Anchor at the end of the window so the capitalized words belong to
        # THIS suffix occurrence. An unanchored search returns the leftmost
        # "<Name> <suffix>" in the window, which hides the second organization
        # in "Acme Inc and Beta Inc" and lets "Lab" match inside "Labs".
        org_match = re.search(
            r'([A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\s+' + re.escape(m.group()) + r')\Z',
            window,
        )
        if not org_match:
            continue
        name = org_match.group(1).strip()
        normalized = name.lower()
        # Skip very short org names (likely false positives like "Buterin co")
        name_without_suffix = name.rsplit(None, 1)[0] if " " in name else name
        if len(name_without_suffix) < 3:
            continue
        if normalized not in seen_names:
            seen_names.add(normalized)
            entities.append(Entity(name=name, entity_type="organization"))

    # Pass 3: Capitalized name pairs (default → person).
    # Sentence-initial matches are extracted like any other candidate.
    for m in _NAME_PATTERN.finditer(text):
        name = m.group(1).strip()
        normalized = name.lower()
        if normalized in seen_names or normalized in _BLOCKLIST:
            continue
        seen_names.add(normalized)
        entities.append(Entity(name=name, entity_type="person"))

    return entities
@@ -0,0 +1,106 @@
1
+ """Korean named entity recognition — dictionary + pattern based."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from engram.models.entity import Entity
8
+
9
# Common single-syllable Korean surnames (100 entries). A candidate word is
# only considered a person name when its first syllable is in this set.
_KOREAN_SURNAMES = frozenset({
    "김", "이", "박", "최", "정", "강", "조", "윤", "장", "임",
    "한", "오", "서", "신", "권", "황", "안", "송", "류", "전",
    "홍", "고", "문", "양", "손", "배", "백", "허", "유", "남",
    "심", "노", "하", "곽", "성", "차", "주", "우", "구", "민",
    "진", "나", "지", "엄", "채", "원", "천", "방", "공", "현",
    "함", "변", "염", "석", "선", "설", "마", "길", "연", "위",
    "표", "명", "기", "반", "라", "왕", "금", "옥", "육", "추",
    "탁", "도", "감", "봉", "모", "맹", "제", "빈", "판", "피",
    "섭", "어", "음", "사", "가", "복", "태", "목", "형", "편",
    "승", "뇌", "범", "상", "갈", "예", "경", "럼", "온", "소",
})

# Korean organization keywords; a word ending in (or followed by) one of these
# is treated as an organization name.
_KR_ORG_KEYWORDS = frozenset({
    "주식회사", "재단", "협회", "학회", "그룹", "홀딩스", "파트너스",
    "캐피탈", "벤처스", "랩", "랩스", "연구소", "대학교", "대학",
    "병원", "은행", "증권", "보험", "카드",
})

# Korean particles (josa) — must be stripped from the name boundary.
# Pre-sorted by length (longest first) so that e.g. "으로" is tried before "로".
_KR_PARTICLES = tuple(sorted(
    ("은", "는", "이", "가", "을", "를", "의", "에", "와", "과",
     "도", "만", "으로", "로", "에서", "부터", "까지", "라고",
     "에게", "한테", "께서"),
    key=len, reverse=True,
))

# Pattern: Organization with keywords (keyword embedded or after space).
# 1-15 Hangul/Latin characters, optional whitespace, then a keyword. Keywords
# are ordered longest first so "대학교" is preferred over "대학".
_KR_ORG_PATTERN = re.compile(
    r'((?:[가-힣A-Za-z]{1,15})\s*(?:' + '|'.join(sorted(_KR_ORG_KEYWORDS, key=len, reverse=True)) + r'))'
)

# Common Korean words that are NOT names (extended stopwords for NER).
# Some entries are listed twice ("진행", "완료"); the frozenset de-duplicates
# them.
_KR_NOT_NAMES = frozenset({
    "오늘", "내일", "어제", "우리", "저희", "여기", "거기", "저기",
    "이번", "다음", "지난", "올해", "작년", "내년", "현재", "최근",
    "모든", "많은", "새로", "다른", "같은", "이런", "그런", "저런",
    "하지", "그리", "따라", "또한", "그래", "아직", "이미", "바로",
    "서로", "다시", "매우", "아주", "정말", "진짜", "너무", "가장",
    "회의", "미팅", "보고", "발표", "논의", "결정", "진행", "완료",
    "사업", "투자", "개발", "관리", "운영", "성장", "확대", "축소",
    "기술", "시스템", "서비스", "플랫", "데이", "프로", "소프",
    "문제", "해결", "방안", "계획", "목표", "전략", "방향", "결과",
    "서비", "기업", "시장", "제품", "고객", "매출", "수익",
    "함께", "시작", "종료", "변경",
    "연구", "조사", "분석", "검토", "평가", "보완", "개선", "추진",
    "진행", "참석", "참여", "준비", "완료", "예정", "필요", "가능",
    "이다", "하다", "되다", "있다", "없다", "이며", "으며", "라고",
})
61
+
62
+
63
def extract_korean_entities(text: str) -> list[Entity]:
    """Find Korean organizations and person names in ``text``.

    Organizations are whatever ``_KR_ORG_PATTERN`` matches. Person names are
    Hangul words that, after particle stripping, are 2-3 syllables long, start
    with a dictionary surname and are not listed in ``_KR_NOT_NAMES``.
    Organizations are emitted first; names are de-duplicated across both
    passes.
    """
    found: list[Entity] = []
    known: set[str] = set()

    def remember(label: str, kind: str) -> None:
        # Record ``label`` once, keyed by its lower-cased form.
        key = label.lower()
        if key in known:
            return
        known.add(key)
        found.append(Entity(name=label, entity_type=kind))

    # Organizations: keyword-driven matches.
    for hit in _KR_ORG_PATTERN.finditer(text):
        remember(hit.group(0).strip(), "organization")

    # Persons: inspect every run of Hangul syllables.
    for token in re.findall(r'[가-힣]+', text):
        candidate = _strip_particles(token)
        looks_like_name = (
            2 <= len(candidate) <= 3
            and candidate[0] in _KOREAN_SURNAMES
            and candidate not in _KR_NOT_NAMES
        )
        if looks_like_name:
            remember(candidate, "person")

    return found
98
+
99
+
100
def _strip_particles(word: str) -> str:
    """Remove one trailing Korean particle (josa) from ``word``, if any.

    Particles are tried in ``_KR_PARTICLES`` order (longest first) and the
    first one that is a proper suffix of ``word`` is removed. A word that
    consists only of a particle is returned unchanged.
    """
    suffix = next(
        (p for p in _KR_PARTICLES if len(word) > len(p) and word.endswith(p)),
        None,
    )
    if suffix is None:
        return word
    return word[:-len(suffix)]