memorytrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- engram/__init__.py +8 -0
- engram/__main__.py +6 -0
- engram/cli/__init__.py +1 -0
- engram/cli/app.py +291 -0
- engram/cli/formatters.py +90 -0
- engram/cli/simple.py +267 -0
- engram/config.py +72 -0
- engram/engine.py +612 -0
- engram/exceptions.py +41 -0
- engram/extraction/__init__.py +6 -0
- engram/extraction/base.py +20 -0
- engram/extraction/llm_extractor.py +197 -0
- engram/extraction/ner/__init__.py +7 -0
- engram/extraction/ner/cjk.py +63 -0
- engram/extraction/ner/english.py +109 -0
- engram/extraction/ner/korean.py +106 -0
- engram/extraction/regex_extractor.py +188 -0
- engram/integrations/__init__.py +1 -0
- engram/integrations/mcp_server.py +213 -0
- engram/integrations/sdk.py +194 -0
- engram/models/__init__.py +19 -0
- engram/models/entity.py +72 -0
- engram/models/fact.py +58 -0
- engram/models/quality.py +61 -0
- engram/models/relation.py +26 -0
- engram/models/search.py +96 -0
- engram/models/session.py +53 -0
- engram/models/source.py +73 -0
- engram/quality/__init__.py +8 -0
- engram/quality/confidence.py +38 -0
- engram/quality/conflict.py +79 -0
- engram/quality/decay.py +28 -0
- engram/quality/gate.py +120 -0
- engram/quality/pii.py +80 -0
- engram/search/__init__.py +13 -0
- engram/search/base.py +20 -0
- engram/search/fts5_search.py +210 -0
- engram/search/hybrid.py +99 -0
- engram/search/semantic.py +186 -0
- engram/search/tokenizer.py +85 -0
- engram/session/__init__.py +6 -0
- engram/session/context.py +87 -0
- engram/session/manager.py +152 -0
- engram/session/working_memory.py +57 -0
- engram/storage/__init__.py +6 -0
- engram/storage/base.py +63 -0
- engram/storage/markdown_export.py +144 -0
- engram/storage/migrations.py +30 -0
- engram/storage/sqlite_store.py +615 -0
- memorytrace-0.1.0.dist-info/METADATA +138 -0
- memorytrace-0.1.0.dist-info/RECORD +54 -0
- memorytrace-0.1.0.dist-info/WHEEL +4 -0
- memorytrace-0.1.0.dist-info/entry_points.txt +3 -0
- memorytrace-0.1.0.dist-info/licenses/LICENSE +21 -0
engram/exceptions.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Engram exception hierarchy."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EngramError(Exception):
    """Common root for every exception defined by Engram."""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EntityNotFoundError(EngramError):
    """Signals that an entity lookup did not succeed."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EntityAlreadyExistsError(EngramError):
    """Signals an attempt to create an entity that is already present."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StorageError(EngramError):
    """Signals a storage backend failure, such as an I/O error or corruption."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ValidationError(EngramError):
    """Signals that data did not pass quality gate validation."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ConflictError(EngramError):
    """Signals that a fact is in conflict with existing data."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SessionError(EngramError):
    """Signals an error in the session lifecycle."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ConfigError(EngramError):
    """Signals a configuration error."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ExtractionError(EngramError):
    """Signals that extracting entities or facts failed."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SecurityError(EngramError):
    """Signals a security violation, for example path traversal."""
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Extractor protocol — the contract all extractors implement."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from engram.models.entity import Entity
|
|
8
|
+
from engram.models.fact import Fact
|
|
9
|
+
from engram.models.relation import Relation
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@runtime_checkable
class Extractor(Protocol):
    """Structural interface shared by all extractors.

    Because the protocol is ``runtime_checkable``, ``isinstance(obj, Extractor)``
    works, but it only verifies that the three methods exist, not their
    signatures.
    """

    def extract_entities(self, text: str) -> list[Entity]:
        """Return the entities found in ``text``."""

    def extract_facts(self, text: str, entities: list[Entity]) -> list[Fact]:
        """Return the facts found in ``text`` given the known ``entities``."""

    def extract_relations(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Return the relations found in ``text`` given the known ``entities``."""
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""LLM-based extractor — high-quality extraction using Claude or GPT.
|
|
2
|
+
|
|
3
|
+
Requires: pip install engram[llm]
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from engram.models.entity import Entity
|
|
12
|
+
from engram.models.fact import Fact
|
|
13
|
+
from engram.models.relation import Relation
|
|
14
|
+
|
|
15
|
+
# Entity types accepted from the model; the prompt below lists the same four.
_VALID_ENTITY_TYPES = frozenset(("person", "organization", "project", "concept"))

# Maximum number of characters kept from an entity name.
_MAX_NAME_LENGTH = 100

# Prompt templates. Each is filled in with ``str.format``; ``{text}`` and
# ``{entities}`` are the only placeholders.

_ENTITY_PROMPT = """Extract named entities from the following text.
Return a JSON array of objects with fields: name, entity_type (person/organization/project/concept).
Only extract clearly identifiable entities. Do not guess.

Text:
{text}

Return ONLY the JSON array, no other text."""

_FACT_PROMPT = """Given the following text and known entities, extract factual statements.
Return a JSON array of objects with fields: subject, predicate, object, raw_text, confidence (0.0-1.0).

Known entities: {entities}

Text:
{text}

Return ONLY the JSON array, no other text."""

_RELATION_PROMPT = """Given the following text and known entities, extract relationships between entities.
Return a JSON array of objects with fields: from_name, to_name, relation_type (e.g., CEO_OF, WORKS_AT, INVESTED_IN).

Known entities: {entities}

Text:
{text}

Return ONLY the JSON array, no other text."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LLMExtractor:
    """LLM-based entity, fact, and relation extraction.

    Uses the Anthropic or OpenAI SDK. The ``extract_*`` methods are
    best-effort: any exception raised while calling the model or handling its
    response is swallowed and an empty list is returned. Individual malformed
    items in an otherwise valid response are skipped rather than discarding
    the whole response.
    """

    def __init__(
        self,
        provider: str = "anthropic",
        model: Optional[str] = None,
    ):
        """Create the SDK client for ``provider``.

        Args:
            provider: ``"anthropic"`` or ``"openai"``.
            model: Model identifier; a per-provider default is used when omitted.

        Raises:
            ImportError: The SDK for the chosen provider is not installed.
            ValueError: ``provider`` is not a supported value.
        """
        self.provider = provider
        self._client = None

        # NOTE(review): the wheel is named "memorytrace" while the install
        # hints below say "engram[llm]" -- confirm which distribution name
        # users should install.
        if provider == "anthropic":
            self.model = model or "claude-sonnet-4-6"
            try:
                from anthropic import Anthropic
            except ImportError as exc:
                raise ImportError(
                    "Anthropic SDK not installed. Run: pip install engram[llm]"
                ) from exc
            self._client = Anthropic()
        elif provider == "openai":
            self.model = model or "gpt-4o-mini"
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise ImportError(
                    "OpenAI SDK not installed. Run: pip install engram[llm]"
                ) from exc
            self._client = OpenAI()
        else:
            raise ValueError(f"Unknown provider: {provider}. Use 'anthropic' or 'openai'.")

    def _call_llm(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text."""
        if self.provider == "anthropic":
            response = self._client.messages.create(
                model=self.model,
                max_tokens=2048,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.content[0].text
        else:
            response = self._client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2048,
            )
            return response.choices[0].message.content or ""

    def _parse_json_response(self, text: str) -> list:
        """Parse a JSON array out of an LLM reply.

        Handles a reply wrapped in a markdown code fence and, as a fallback,
        a reply where the array is surrounded by extra text. Returns ``[]``
        when no JSON array can be recovered or the JSON is not a list.
        """
        text = text.strip()
        if text.startswith("```"):
            lines = text.split("\n")
            # Drop the opening fence line, and the closing one if present.
            end = -1 if lines[-1].strip().startswith("```") else len(lines)
            text = "\n".join(lines[1:end])
        try:
            result = json.loads(text)
        except json.JSONDecodeError:
            # Models sometimes add prose around the array or leave the closing
            # fence on the last content line; retry on the outermost [...] span.
            start, stop = text.find("["), text.rfind("]")
            if start == -1 or stop <= start:
                return []
            try:
                result = json.loads(text[start:stop + 1])
            except json.JSONDecodeError:
                return []
        return result if isinstance(result, list) else []

    @staticmethod
    def _clean_text(value) -> str:
        """Return ``value`` stripped if it is a string, otherwise ``""``."""
        return value.strip() if isinstance(value, str) else ""

    @staticmethod
    def _coerce_confidence(value, default: float = 0.7) -> float:
        """Clamp ``value`` to ``[0.0, 1.0]``; fall back to ``default`` if not numeric."""
        if not isinstance(value, (int, float)):
            try:
                value = float(value)  # accept numeric strings such as "0.8"
            except (TypeError, ValueError):
                return default
        return min(1.0, max(0.0, value))

    def extract_entities(self, text: str) -> list[Entity]:
        """Extract entities from the first 4000 characters of ``text``.

        Names are de-duplicated case-insensitively and truncated to
        ``_MAX_NAME_LENGTH``; unknown entity types default to ``"person"``.
        Returns ``[]`` for blank text or on any failure.
        """
        if not text.strip():
            return []
        try:
            prompt = _ENTITY_PROMPT.format(text=text[:4000])
            items = self._parse_json_response(self._call_llm(prompt))
            entities = []
            seen = set()
            for item in items:
                if not isinstance(item, dict):
                    continue  # skip malformed entries instead of failing the batch
                # Truncate before the duplicate check so two long names that
                # collapse to the same truncated name are not both emitted.
                name = self._clean_text(item.get("name"))[:_MAX_NAME_LENGTH]
                key = name.lower()
                if not name or key in seen:
                    continue
                entity_type = item.get("entity_type", "person")
                if not isinstance(entity_type, str) or entity_type not in _VALID_ENTITY_TYPES:
                    entity_type = "person"
                seen.add(key)
                entities.append(Entity(
                    name=name,
                    entity_type=entity_type,
                ))
            return entities
        except Exception:
            return []

    def extract_facts(self, text: str, entities: list[Entity]) -> list[Fact]:
        """Extract facts whose subject matches one of ``entities`` (by name,
        case-insensitively).

        Returns ``[]`` for blank text, no entities, or on any failure.
        """
        if not text.strip() or not entities:
            return []
        try:
            entity_names = ", ".join(e.name for e in entities)
            entity_map = {e.name.lower(): e for e in entities}
            prompt = _FACT_PROMPT.format(text=text[:4000], entities=entity_names)
            items = self._parse_json_response(self._call_llm(prompt))
            facts = []
            for item in items:
                if not isinstance(item, dict):
                    continue
                entity = entity_map.get(self._clean_text(item.get("subject")).lower())
                if not entity:
                    continue
                facts.append(Fact(
                    entity_id=entity.id,
                    subject=entity.name,
                    predicate=item.get("predicate", "attribute"),
                    object=item.get("object", ""),
                    raw_text=item.get("raw_text", ""),
                    confidence=self._coerce_confidence(item.get("confidence", 0.7)),
                ))
            return facts
        except Exception:
            return []

    def extract_relations(self, text: str, entities: list[Entity]) -> list[Relation]:
        """Extract relations whose two endpoints both match ``entities`` by name.

        Returns ``[]`` for blank text, fewer than two entities, or on any failure.
        """
        if not text.strip() or len(entities) < 2:
            return []
        try:
            entity_names = ", ".join(e.name for e in entities)
            entity_map = {e.name.lower(): e for e in entities}
            prompt = _RELATION_PROMPT.format(text=text[:4000], entities=entity_names)
            items = self._parse_json_response(self._call_llm(prompt))
            relations = []
            for item in items:
                if not isinstance(item, dict):
                    continue
                from_entity = entity_map.get(self._clean_text(item.get("from_name")).lower())
                to_entity = entity_map.get(self._clean_text(item.get("to_name")).lower())
                if from_entity and to_entity:
                    relations.append(Relation(
                        from_entity_id=from_entity.id,
                        to_entity_id=to_entity.id,
                        relation_type=item.get("relation_type", "RELATED_TO"),
                    ))
            return relations
        except Exception:
            return []
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Named Entity Recognition modules."""
|
|
2
|
+
|
|
3
|
+
from engram.extraction.ner.english import extract_english_entities
|
|
4
|
+
from engram.extraction.ner.korean import extract_korean_entities
|
|
5
|
+
from engram.extraction.ner.cjk import extract_cjk_entities
|
|
6
|
+
|
|
7
|
+
__all__ = ["extract_english_entities", "extract_korean_entities", "extract_cjk_entities"]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""CJK (Chinese/Japanese) named entity recognition — basic fallback."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from engram.models.entity import Entity
|
|
8
|
+
|
|
9
|
+
# Single-character Chinese surnames used to accept or reject candidate names.
_CHINESE_SURNAMES = frozenset({
    "王", "李", "张", "刘", "陈", "杨", "黄", "赵", "吴", "周",
    "徐", "孙", "马", "朱", "胡", "郭", "林", "何", "高", "罗",
    "郑", "梁", "谢", "宋", "唐", "韩", "曹", "许", "邓", "萧",
    "冯", "曾", "程", "蔡", "彭", "潘", "袁", "于", "董", "余",
    "苏", "叶", "吕", "魏", "蒋", "田", "杜", "丁", "沈", "姜",
})

# Japanese surnames (one to three characters) used to anchor name matches.
_JAPANESE_SURNAMES = frozenset({
    "佐藤", "鈴木", "高橋", "田中", "伊藤", "渡辺", "山本", "中村",
    "小林", "加藤", "吉田", "山田", "佐々木", "松本", "井上", "木村",
    "林", "斎藤", "清水", "山崎", "池田", "橋本", "阿部", "石川",
    "山下", "中島", "前田", "藤田", "小川", "後藤", "岡田", "長谷川",
    "村上", "近藤", "石井", "遠藤", "坂本", "青木", "藤井", "西村",
    "福田", "太田", "三浦", "藤原", "松田", "岡本", "中川", "中野",
})

# Chinese name candidate: group 1 is one Han character (the would-be surname),
# group 2 is the following one or two Han characters (the given name).
_CN_NAME_PATTERN = re.compile(r'([\u4E00-\u9FFF])([\u4E00-\u9FFF]{1,2})')

# Japanese name: group 1 is a listed surname, group 2 is one to three
# following hiragana, katakana, or kanji characters.
_JP_SURNAME_PATTERN = re.compile(
    '(' + '|'.join(map(re.escape, _JAPANESE_SURNAMES)) + r')([\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]{1,3})'
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def extract_cjk_entities(text: str) -> list[Entity]:
    """Extract Chinese and Japanese person names from ``text``.

    Basic dictionary approach:
    - Chinese: a listed one-character surname followed by one or two Han
      characters.
    - Japanese: a listed surname followed by one to three kana/kanji
      characters.

    Returns person entities in order of first appearance (Chinese pass first,
    then Japanese), without duplicate names.
    """
    entities: list[Entity] = []
    seen: set[str] = set()

    def _record(name: str) -> None:
        # Keep only the first occurrence of each exact name.
        if name not in seen:
            seen.add(name)
            entities.append(Entity(name=name, entity_type="person"))

    # Chinese names. A plain ``finditer`` returns non-overlapping matches, so
    # it would consume every run of Han characters in fixed three-character
    # chunks and only ever test the first character of each chunk; a name
    # such as "王小明" in "我是王小明" would be missed. Instead, when a
    # candidate does not start with a known surname, retry one character
    # further on.
    pos = 0
    while True:
        m = _CN_NAME_PATTERN.search(text, pos)
        if m is None:
            break
        if m.group(1) in _CHINESE_SURNAMES:
            _record(m.group(0))
            pos = m.end()
        else:
            pos = m.start() + 1

    # Japanese names: the pattern is already anchored on a listed surname.
    for m in _JP_SURNAME_PATTERN.finditer(text):
        _record(m.group(0))

    return entities
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""English named entity recognition — context-aware pattern matching."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from engram.models.entity import Entity
|
|
8
|
+
|
|
9
|
+
# Titles that precede names
_TITLES = frozenset({
    "mr", "mrs", "ms", "dr", "prof", "professor", "sir", "lady",
    "president", "ceo", "cto", "cfo", "coo", "vp", "director",
})

# Words that look like names but aren't (expanded blocklist).
# Entries are lowercase; callers compare against lowercased candidates.
_BLOCKLIST = frozenset({
    # Common section headers / phrases
    "key points", "executive summary", "see also", "last updated",
    "time line", "timeline", "action items", "next steps", "open loops",
    "north america", "south america", "south korea", "north korea",
    "new york", "los angeles", "san francisco", "san diego", "san jose",
    "las vegas", "el paso", "hong kong", "sri lanka", "costa rica",
    "buenos aires", "rio grande", "cape town", "new zealand",
    "united states", "united kingdom", "saudi arabia", "south africa",
    # Tech / product terms
    "machine learning", "deep learning", "artificial intelligence",
    "open source", "real time", "high quality", "best practice",
    "cloud computing", "data science", "big data",
    # Common false positives
    "thank you", "good morning", "good afternoon", "good evening",
    "happy new", "merry christmas", "looking forward",
})

# Organization suffixes (case-insensitive); group 1 is the matched suffix.
# "Co." is handled separately: it ends in a period, and a trailing ``\b``
# after a period only succeeds when a word character follows, so
# "Acme Co. makes ..." would never match. It uses ``(?!\w)`` instead.
_ORG_SUFFIXES = re.compile(
    r'\b((?:Inc\.?|Corp\.?|Ltd\.?|LLC|GmbH|Foundation|Association'
    r'|Institute|University|Partners|Capital|Ventures|Labs?|Group'
    r'|Holdings|Technologies|Solutions|Networks|Studios?)\b'
    r'|Co\.(?!\w))',
    re.IGNORECASE,
)

# Pattern: Two or three consecutive capitalized words
# Use lookaround instead of \b for CJK-mixed text compatibility
_NAME_PATTERN = re.compile(
    r'(?<![a-zA-Z])([A-Z][a-z]{1,15}(?:\s+[A-Z][a-z]{1,15}){1,2})(?![a-zA-Z])'
)

# Pattern: Title + Name. The period after the title is required; group 1 is
# the name (two or three capitalized words) without the title.
_TITLED_NAME = re.compile(
    r'\b(?:Mr|Mrs|Ms|Dr|Prof|CEO|CTO)\.\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b'
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def extract_english_entities(text: str) -> list[Entity]:
    """Extract English named entities from ``text``.

    Three passes, each skipping names (compared lowercased) already recorded
    by an earlier pass:

    1. Title + name (``_TITLED_NAME``) -> person.
    2. Capitalized words ending in an organization suffix
       (``_ORG_SUFFIXES``) -> organization.
    3. Two or three consecutive capitalized words (``_NAME_PATTERN``)
       -> person.

    Passes 1 and 3 also drop names listed in ``_BLOCKLIST``.
    """
    entities: list[Entity] = []
    seen_names: set[str] = set()

    # Pass 1: Titled names (high confidence)
    for m in _TITLED_NAME.finditer(text):
        name = m.group(1).strip()
        normalized = name.lower()
        if normalized not in seen_names and normalized not in _BLOCKLIST:
            seen_names.add(normalized)
            entities.append(Entity(name=name, entity_type="person"))

    # Pass 2: Organizations (suffix-based)
    for m in _ORG_SUFFIXES.finditer(text):
        # Look backwards (at most 60 characters) for the organization name.
        window = text[max(0, m.start() - 60):m.end()]
        # Anchor at the end of the window so the name found is the one that
        # ends at *this* suffix. Unanchored, the leftmost match could be an
        # earlier organization in the window (hiding this one as a
        # duplicate) or a prefix of a longer word.
        org_match = re.search(
            r'([A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\s+' + re.escape(m.group()) + r')\Z',
            window,
        )
        if not org_match:
            continue
        name = org_match.group(1).strip()
        normalized = name.lower()
        # Skip very short org names (likely false positives like "Buterin co").
        # The name always contains whitespace before the suffix, which may be
        # a tab or newline, so split on any whitespace.
        name_without_suffix = name.rsplit(None, 1)[0]
        if len(name_without_suffix) < 3:
            continue
        if normalized not in seen_names:
            seen_names.add(normalized)
            entities.append(Entity(name=name, entity_type="organization"))

    # Pass 3: Capitalized name pairs (default -> person). Matches at the start
    # of a sentence are extracted like any other.
    for m in _NAME_PATTERN.finditer(text):
        name = m.group(1).strip()
        normalized = name.lower()
        if normalized in seen_names or normalized in _BLOCKLIST:
            continue
        seen_names.add(normalized)
        entities.append(Entity(name=name, entity_type="person"))

    return entities
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Korean named entity recognition — dictionary + pattern based."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from engram.models.entity import Entity
|
|
8
|
+
|
|
9
|
+
# One-syllable Korean surnames used to accept candidate person names.
_KOREAN_SURNAMES = frozenset({
    "김", "이", "박", "최", "정", "강", "조", "윤", "장", "임",
    "한", "오", "서", "신", "권", "황", "안", "송", "류", "전",
    "홍", "고", "문", "양", "손", "배", "백", "허", "유", "남",
    "심", "노", "하", "곽", "성", "차", "주", "우", "구", "민",
    "진", "나", "지", "엄", "채", "원", "천", "방", "공", "현",
    "함", "변", "염", "석", "선", "설", "마", "길", "연", "위",
    "표", "명", "기", "반", "라", "왕", "금", "옥", "육", "추",
    "탁", "도", "감", "봉", "모", "맹", "제", "빈", "판", "피",
    "섭", "어", "음", "사", "가", "복", "태", "목", "형", "편",
    "승", "뇌", "범", "상", "갈", "예", "경", "럼", "온", "소",
})

# Keywords that mark a Korean organization name.
_KR_ORG_KEYWORDS = frozenset({
    "주식회사", "재단", "협회", "학회", "그룹", "홀딩스", "파트너스",
    "캐피탈", "벤처스", "랩", "랩스", "연구소", "대학교", "대학",
    "병원", "은행", "증권", "보험", "카드",
})

# Korean particles (postpositions) to strip from the end of a word.
# Sorted longest first so a two-syllable particle wins over a one-syllable
# particle that is its suffix.
_KR_PARTICLES = tuple(sorted(
    (
        "은", "는", "이", "가", "을", "를", "의", "에", "와", "과",
        "도", "만", "으로", "로", "에서", "부터", "까지", "라고",
        "에게", "한테", "께서",
    ),
    key=len,
    reverse=True,
))

# Organization: 1-15 Hangul/Latin letters, optional whitespace, then one of
# the keywords (longest keywords tried first).
_KR_ORG_PATTERN = re.compile(
    r'((?:[가-힣A-Za-z]{1,15})\s*(?:' + '|'.join(sorted(_KR_ORG_KEYWORDS, key=len, reverse=True)) + r'))'
)

# Common Korean words that must not be reported as person names.
_KR_NOT_NAMES = frozenset({
    "오늘", "내일", "어제", "우리", "저희", "여기", "거기", "저기",
    "이번", "다음", "지난", "올해", "작년", "내년", "현재", "최근",
    "모든", "많은", "새로", "다른", "같은", "이런", "그런", "저런",
    "하지", "그리", "따라", "또한", "그래", "아직", "이미", "바로",
    "서로", "다시", "매우", "아주", "정말", "진짜", "너무", "가장",
    "회의", "미팅", "보고", "발표", "논의", "결정", "진행", "완료",
    "사업", "투자", "개발", "관리", "운영", "성장", "확대", "축소",
    "기술", "시스템", "서비스", "플랫", "데이", "프로", "소프",
    "문제", "해결", "방안", "계획", "목표", "전략", "방향", "결과",
    "서비", "기업", "시장", "제품", "고객", "매출", "수익",
    "함께", "시작", "종료", "변경",
    "연구", "조사", "분석", "검토", "평가", "보완", "개선", "추진",
    "진행", "참석", "참여", "준비", "완료", "예정", "필요", "가능",
    "이다", "하다", "되다", "있다", "없다", "이며", "으며", "라고",
})
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_korean_entities(text: str) -> list[Entity]:
    """Find Korean organizations and person names in ``text``.

    Organizations are matched with ``_KR_ORG_PATTERN``. Person names are
    Hangul words that, once a trailing particle is stripped, are two or three
    syllables long, start with a listed surname, and are not in
    ``_KR_NOT_NAMES``. Each name is reported once; organizations come first.
    """
    found: list[Entity] = []
    known: set[str] = set()

    # Organizations first, so a person candidate equal to an organization
    # name is not reported again.
    for match in _KR_ORG_PATTERN.finditer(text):
        org = match.group(0).strip()
        key = org.lower()
        if key in known:
            continue
        known.add(key)
        found.append(Entity(name=org, entity_type="organization"))

    # Person names: examine every run of Hangul syllables.
    for token in re.findall(r'[가-힣]+', text):
        candidate = _strip_particles(token)
        key = candidate.lower()
        rejected = (
            not 2 <= len(candidate) <= 3
            or candidate[0] not in _KOREAN_SURNAMES
            or candidate in _KR_NOT_NAMES
            or key in known
        )
        if rejected:
            continue
        known.add(key)
        found.append(Entity(name=candidate, entity_type="person"))

    return found
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _strip_particles(word: str) -> str:
    """Return ``word`` without one trailing Korean particle, if it has one.

    Particles are tried in ``_KR_PARTICLES`` order (longest first) and only
    the first match is removed. A particle is never stripped when that would
    leave an empty string.
    """
    candidates = (
        particle
        for particle in _KR_PARTICLES
        if len(particle) < len(word) and word.endswith(particle)
    )
    suffix = next(candidates, None)
    return word if suffix is None else word[:-len(suffix)]
|