tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/ner_prep.py
ADDED
@@ -0,0 +1,747 @@

```python
"""
NER preprocessing helpers (still POS/NER-free)
=============================================

TokMor core does NOT provide NER models. This module provides a small, deterministic
helper layer to make it *easy* to apply TokMor outputs to NER pipelines:

- run `segment(..., include_sns_tags=True)` to get stable tokens + offsets
- emit SNS discourse markers as separate "NER-style" entities
- filter out discourse markers / punctuation from the token stream used by NER
"""

from __future__ import annotations

import re
import unicodedata
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional

SegmentToken = Dict[str, Any]

_RX_DATE = re.compile(r"^(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}:\d{2}(:\d{2})?)$")


def _is_punct_or_space(text: str) -> bool:
    if not text:
        return True
    if text.strip() == "":
        return True
    cats = [unicodedata.category(c) for c in text]
    return all(c.startswith(("P", "S")) for c in cats)


def _is_discourse_marker(token: SegmentToken) -> bool:
    sns = token.get("sns")
    return isinstance(sns, dict) and sns.get("class") == "DISCOURSE_MARKER"


def _is_number(text: str) -> bool:
    if not text:
        return False
    t = text.replace(",", "").replace("_", "")
    t = t.replace("-", "", 1)
    t = t.replace(".", "", 1)
    return t.isdigit()


def token_shape_hint(text: str) -> str:
    """
    Tiny deterministic hint for NER pipelines (NOT a POS tagger).

    Returns one of:
    - Q: number-like
    - T: date/time-like
    - S: punct/symbol-only
    - O: other/unknown
    """
    t = str(text or "")
    if not t:
        return "O"
    if _is_punct_or_space(t):
        return "S"
    if _RX_DATE.match(t):
        return "T"
    if _is_number(t):
        return "Q"
    return "O"
```
```python
@lru_cache(maxsize=512)
def _lang_configs_dir() -> Optional[Path]:
    """
    Resolve lang_configs directory if available.

    Priority:
    - TOKMOR_LANG_CONFIGS_DIR (explicit)
    - repo root sibling: TokMor/lang_configs (dev)
    """
    try:
        import os

        p = os.getenv("TOKMOR_LANG_CONFIGS_DIR", "").strip()
        if p:
            d = Path(p).expanduser().resolve()
        else:
            # ner_prep.py -> TokMor_v1/tokmor/ner_prep.py, repo root is parents[2]
            d = Path(__file__).resolve().parents[2] / "lang_configs"
        return d if d.exists() else None
    except Exception:
        return None


@lru_cache(maxsize=512)
def _load_lang_config_json(lang: str) -> Optional[Dict[str, Any]]:
    """
    Load lang_configs/{lang}.json (if available).
    """
    d = _lang_configs_dir()
    if not d:
        return None
    base = (lang or "").split("-", 1)[0].lower()
    fp = d / f"{base}.json"
    if not fp.exists():
        return None
    try:
        import json

        data = json.loads(fp.read_text("utf-8", errors="replace"))
        return data if isinstance(data, dict) else None
    except Exception:
        return None
```
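Config resolution is environment-driven: `TOKMOR_LANG_CONFIGS_DIR` wins over the dev-checkout fallback, and language tags are reduced to their base (`en-US` becomes `en`). A minimal sketch of pointing the loader at a custom directory (the temp path and JSON content are hypothetical):

```python
import json
import os
import tempfile
from pathlib import Path

# Hypothetical lang_configs directory containing one English config.
d = Path(tempfile.mkdtemp()) / "lang_configs"
d.mkdir()
(d / "en.json").write_text(json.dumps({"pos": {"determiners": ["the", "a"]}}), "utf-8")

os.environ["TOKMOR_LANG_CONFIGS_DIR"] = str(d)
# _load_lang_config_json("en") (and "en-US", normalized to "en") would now
# return {"pos": {"determiners": ["the", "a"]}}. Note both resolvers are
# lru_cache'd, so the env var must be set before the first lookup.
```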
```python
@lru_cache(maxsize=256)
def _function_word_map(lang: str) -> Dict[str, str]:
    """
    Best-effort function-word lexicon for the given language.

    Source: tokmor morphology analyzers (built-in, small, high-precision).
    Output values are analyzer-specific tags (DET/PRON/PREP/CONJ/AUX/NEG/...).
    """
    out: Dict[str, str] = {}

    base = (lang or "").split("-", 1)[0].lower()  # zh-cn -> zh

    # 0) Prefer lang_configs/*.json if present (358 langs).
    try:
        data = _load_lang_config_json(base) or {}
        pos = data.get("pos", {}) if isinstance(data, dict) else {}

        # We only materialize *function-ish* tags into the map.
        # Do NOT treat ADV/ADJ/NOUN as function words for hard-blocking.
        fields = [
            ("determiners", "DET"),
            ("pronouns", "PRON"),
            ("auxiliaries", "AUX"),
            ("prepositions", "ADP"),
            ("conjunctions", "CCONJ"),
            ("particles", "PART"),
            ("postpositions", "ADP"),
        ]
        for key, tag in fields:
            xs = pos.get(key, [])
            if isinstance(xs, list):
                for w in xs:
                    if isinstance(w, str) and w:
                        out[w] = tag
    except Exception:
        pass

    # 1) Fallback: use tokmor morphology analyzers' built-in tiny function_words dicts.
    # Keep this conservative: ignore large open-class buckets (esp. English adverbs).
    try:
        from .morphology.unified import get_unified_analyzer

        an = get_unified_analyzer(lang)
        sa = getattr(an, "specialized_analyzer", None)
        if not sa:
            return out

        fw = getattr(sa, "function_words", None)
        if isinstance(fw, dict):
            for k, v in fw.items():
                if isinstance(k, str) and k:
                    out.setdefault(k, str(v))

        # Some analyzers keep closed-class buckets (sets) instead of a single dict.
        # NOTE: do NOT use "adverbs" here (often huge / open-class).
        buckets = [
            ("determiners", "DET"),
            ("pronouns", "PRON"),
            ("auxiliaries", "AUX"),
            ("prepositions", "ADP"),
            ("conjunctions", "CCONJ"),
            ("particles", "PART"),
            ("postpositions", "ADP"),
        ]
        for attr, tag in buckets:
            s = getattr(sa, attr, None)
            if isinstance(s, set):
                for k in s:
                    if isinstance(k, str) and k:
                        out.setdefault(k, tag)
            elif isinstance(s, dict):
                for k in s.keys():
                    if isinstance(k, str) and k:
                        out.setdefault(k, tag)
    except Exception:
        return out

    return out
```
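The reader loop above implies the per-language JSON shape: a top-level `"pos"` object whose closed-class buckets are plain string lists. A sketch of a config `_function_word_map` would consume (the shape is inferred from the code; the words are just examples):

```python
# Only these buckets are materialized as function words; open-class lists
# (nouns/adjectives/adverbs) are deliberately ignored for hard-blocking.
example_config = {
    "pos": {
        "determiners": ["the", "a", "an"],    # -> DET
        "pronouns": ["he", "she", "they"],    # -> PRON
        "auxiliaries": ["will", "would"],     # -> AUX
        "prepositions": ["in", "on", "at"],   # -> ADP
        "conjunctions": ["and", "or"],        # -> CCONJ
        "particles": ["to", "not"],           # -> PART
        "postpositions": [],                  # -> ADP (head-final languages)
    }
}
```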
```python
def function_word_tag(lang: str, token_text: str) -> Optional[str]:
    """
    Return a small closed-class tag if `token_text` looks like a function word for `lang`.
    Conservative: prefers exact surface match; falls back to lowercased match.
    """
    t = str(token_text or "")
    if not t:
        return None
    m = _function_word_map(lang)
    if not m:
        return None
    if t in m:
        return m[t]
    tl = t.lower()
    if tl in m:
        return m[tl]
    return None


_HARD_BLOCK_FWTAGS = {"DET", "PRON", "AUX", "ADP", "CCONJ", "SCONJ", "PART"}
```
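`_HARD_BLOCK_FWTAGS` is the deny-list that `ner_preprocess` (further down) applies on top of `function_word_tag`. A sketch of the same filter in isolation, with made-up tokens:

```python
tokens = ["the", "Eiffel", "Tower", "in", "Paris"]
kept = [
    t for t in tokens
    if (function_word_tag("en", t) or "") not in _HARD_BLOCK_FWTAGS
]
# If the English lexicon tags "the" as DET and "in" as ADP, this keeps
# ["Eiffel", "Tower", "Paris"]. Unknown tokens map to None -> "" and are
# always kept, matching the module's abstain-by-default stance.
```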
```python
@lru_cache(maxsize=2048)
def _extended_pos_map(lang: str) -> Dict[str, str]:
    """
    Load optional external extended POS hints (surface -> coarse tag).

    Expected location (runtime):
        TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json

    This is built by: scripts/build_tokmor_data_pack_from_wiktextract.py --build-extended-dict
    """
    # Allow disabling large optional dictionaries entirely for "lite" deployments.
    try:
        import os

        v = (os.getenv("TOKMOR_DISABLE_EXTENDED_DICT", "") or "").strip().lower()
        if v in {"1", "true", "yes", "y", "on"}:
            return {}
    except Exception:
        pass
    try:
        from . import resources
    except Exception:
        return {}

    base = (lang or "").split("-", 1)[0].lower()
    p = resources.data_dir() / "extended_dict" / f"{base}_extended.json"
    if not p.exists():
        return {}
    try:
        import json

        obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
        if isinstance(obj, dict):
            # normalize to str->str
            out: Dict[str, str] = {}
            for k, v in obj.items():
                if not isinstance(k, str) or not k:
                    continue
                if not isinstance(v, str) or not v:
                    continue
                out[k] = v
            return out
    except Exception:
        return {}
    return {}
```
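The extended dictionary is a flat surface-to-tag JSON object; non-string keys or values are silently dropped during normalization. A sketch of the expected file and the "lite" kill switch (the file contents here are hypothetical; the path pattern and env var come from the code above):

```python
# TOKMOR_DATA_DIR/extended_dict/en_extended.json -- flat {surface: POS} map.
# Tags are upper-cased by the consumer, so "noun" and "NOUN" both work.
example_extended = {
    "river": "NOUN",
    "run": "VERB",
    "quick": "ADJ",
    "quickly": "ADV",
}

# Lite deployments can skip these dictionaries entirely:
#   export TOKMOR_DISABLE_EXTENDED_DICT=1
# (any of 1/true/yes/y/on; the map then resolves to {} for every language)
```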
```python
def _pos4_from_extended_dict(lang: str, token_text: str) -> Optional[str]:
    """
    Return POS4 from external extended_dict if present, else None.
    """
    t = str(token_text or "")
    if not t:
        return None
    base = (lang or "").split("-", 1)[0].lower()

    m = _extended_pos_map(base)
    if not m:
        return None

    # Match builder behavior: some languages lower-case keys.
    key = t
    if base in {"en", "de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi"}:
        key = t.lower()
    tag = m.get(key) or m.get(t) or m.get(t.lower())
    if not tag:
        return None
    u = tag.upper()
    if u in {"NOUN", "PROPN"}:
        return "N"
    if u in {"VERB", "AUX"}:
        return "V"
    if u == "ADJ":
        return "ADJ"
    if u == "ADV":
        return "ADV"
    return None
```
```python
@lru_cache(maxsize=50000)
def _pos4_hint_cached(lang: str, token_text: str) -> str:
    """
    POS4 hints for NER: only N/V/ADJ/ADV when confident, else UNK.
    - N: NOUN/PROPN-ish (content noun, incl. proper noun)
    - V: VERB/AUX-ish
    - ADJ: ADJ-ish
    - ADV: ADV-ish
    - UNK: abstain
    """
    t = str(token_text or "")
    if not t or len(t) > 64:
        return "UNK"

    # Never try to POS-tag pure punctuation/symbols/dates/numbers here.
    # Those are covered by token_shape_hint(Q/T/S) already.
    if token_shape_hint(t) != "O":
        return "UNK"

    # Korean morpheme tokens: unified analyzer often returns coarse nouns for stems/endings.
    # Add a tiny whitelist to improve V hints in morpheme-split output (ko default morphology=True).
    base = (lang or "").split("-", 1)[0].lower()
    if base == "ko":
        ko_verbish = {
            # common light verbs / auxiliaries (surface fragments)
            "하", "되", "있", "없", "이", "아니",
            # common past/tense/ending fragments seen in our tokenizer output
            "었", "았", "였", "겠", "시", "지", "고", "서", "면", "다", "요",
        }
        if t in ko_verbish:
            return "V"

    # 0) If an external extended_dict exists, prefer it as a cheap high-coverage hint.
    # This is the primary "best-effort POS4" mechanism for many languages.
    try:
        ex = _pos4_from_extended_dict(base, t)
        if ex:
            return ex
    except Exception:
        pass

    # 1) Prefer lang_configs/*.json word lists + suffixes (358 langs) if present.
    try:
        data = _load_lang_config_json(base) or {}
        pos = data.get("pos", {}) if isinstance(data, dict) else {}
        tl = t.lower()

        def _in_list(key: str, token: str) -> bool:
            xs = pos.get(key, [])
            return isinstance(xs, list) and token in xs

        # Direct word lists (these are small and high-signal)
        if _in_list("nouns", tl):
            return "N"
        if _in_list("adjectives", tl):
            return "ADJ"
        if _in_list("adverbs", tl):
            return "ADV"
        # Treat auxiliaries as verbish for POS4 purposes
        if _in_list("auxiliaries", tl):
            return "V"

        # Suffix hints (conservative)
        suf = data.get("suffixes", {}) if isinstance(data, dict) else {}
        if isinstance(suf, dict) and t.isalpha() and len(t) >= 4:
            advs = suf.get("adv", [])
            adjs = suf.get("adj", [])
            verbs = suf.get("verb", [])
            nouns = suf.get("noun", [])

            if isinstance(advs, list) and any(tl.endswith(s) for s in advs if isinstance(s, str) and s):
                return "ADV"
            if isinstance(adjs, list) and any(tl.endswith(s) for s in adjs if isinstance(s, str) and s):
                return "ADJ"
            if isinstance(verbs, list) and any(tl.endswith(s) for s in verbs if isinstance(s, str) and s):
                # Avoid over-tagging in non-Latin scripts unless explicitly configured
                return "V" if (tl.isascii() or base in {"ar"}) else "UNK"
            if isinstance(nouns, list) and any(tl.endswith(s) for s in nouns if isinstance(s, str) and s):
                # Noun suffixes are noisy; apply only for Latin-ish tokens.
                if tl.isascii():
                    return "N"
    except Exception:
        pass

    # 1.5) Tiny global heuristic: Latin TitleCase often indicates a name (helps vi/fr/es/...).
    if t[:1].isupper() and t.isalpha() and len(t) >= 2:
        if not (t.isupper() and len(t) <= 3):
            return "N"

    # 2) Fallback: use tokmor unified morphology tags (token-level, abstaining)
    try:
        from .morphology.unified import get_unified_analyzer

        an = get_unified_analyzer(lang)

        # Chinese: prefer specialized analyzer morphemes (they contain POS like n/v/a/d/p/u...).
        if base.startswith("zh"):
            sa = getattr(an, "specialized_analyzer", None)
            if sa and hasattr(sa, "analyze"):
                r = sa.analyze(t)
                morphemes = None
                if hasattr(r, "best") and hasattr(r.best, "morphemes"):
                    morphemes = r.best.morphemes
                elif hasattr(r, "morphemes"):
                    morphemes = r.morphemes
                tags: List[str] = []
                if morphemes:
                    for m in morphemes:
                        p = str(getattr(m, "pos", "") or "")
                        if not p:
                            continue
                        # Common Chinese tags in our analyzer:
                        # n/ns/nr/nt/nrt... noun-ish, v verb, a adj, d adv, r pron
                        if p.startswith("n"):
                            tags.append("N")
                        elif p.startswith("v"):
                            tags.append("V")
                        elif p.startswith("a"):
                            tags.append("ADJ")
                        elif p.startswith("d"):
                            tags.append("ADV")
                        elif p == "r":
                            tags.append("N")
                if tags:
                    # Priority: V > ADJ > ADV > N (matches surface aggregation)
                    if "V" in tags:
                        return "V"
                    if "ADJ" in tags:
                        return "ADJ"
                    if "ADV" in tags:
                        return "ADV"
                    if "N" in tags:
                        return "N"

        morphs = an.analyze(t)
        tags: List[str] = []
        for m in morphs:
            p = str(getattr(m, "pos", "") or "")
            pu = p.upper()
            # universal-style
            if pu in {"NOUN", "PROPN"}:
                tags.append("N")
            elif pu in {"VERB", "AUX"}:
                tags.append("V")
            elif pu == "ADJ":
                tags.append("ADJ")
            elif pu == "ADV":
                tags.append("ADV")
            else:
                # Korean (Sejong-ish)
                if base == "ko":
                    b = p.split("+", 1)[0]
                    if b.startswith("NN"):
                        tags.append("N")
                    elif b in {"VV", "VX", "VCP", "VCN"}:
                        tags.append("V")
                    elif b == "VA":
                        tags.append("ADJ")
                    elif b in {"MAG", "MAJ"}:
                        tags.append("ADV")
                    # derivational verb/adjective suffixes and endings -> verbish for NER hints
                    elif b in {"XSV"} or b.startswith("E"):
                        tags.append("V")
                    elif b in {"XSA"}:
                        tags.append("ADJ")
                # Japanese (string tags)
                if base == "ja":
                    if "名詞" in p:
                        tags.append("N")
                    elif "動詞" in p or "助動詞" in p:
                        tags.append("V")
                    elif "形容詞" in p:
                        tags.append("ADJ")
                    elif "副詞" in p:
                        tags.append("ADV")
                # Chinese (jieba-like short tags)
                if base.startswith("zh"):
                    if p.startswith("n"):
                        tags.append("N")
                    elif p.startswith("v"):
                        tags.append("V")
                    elif p.startswith("a"):
                        tags.append("ADJ")
                    elif p == "d":
                        tags.append("ADV")
        if not tags:
            return "UNK"
        # majority vote with abstain
        from collections import Counter

        c = Counter(tags)
        top, n = c.most_common(1)[0]
        if n / len(tags) < 0.75:
            return "UNK"
        return top
    except Exception:
        return "UNK"

    return "UNK"
```
```python
def pos4_hint(lang: str, token_text: str) -> str:
    return _pos4_hint_cached(lang, token_text)
```
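Because every lookup layer abstains rather than guesses, `pos4_hint` degrades gracefully: with no optional data packs installed, only the shape guard and the TitleCase heuristic fire. Hedged expectations (actual results depend on which dictionaries are present at runtime):

```python
pos4_hint("fr", "Paris")   # "N"   -- Latin TitleCase heuristic, unless a dict overrides it
pos4_hint("en", "12:30")   # "UNK" -- shape hint T wins; POS tagging is skipped entirely
pos4_hint("en", "x" * 65)  # "UNK" -- over the 64-char guard
```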
```python
def build_sns_entities(tokens: List[SegmentToken], *, text: str) -> List[Dict[str, Any]]:
    """
    Convert tokens tagged as DISCOURSE_MARKER into NER-style structured entities.
    """
    out: List[Dict[str, Any]] = []
    for t in tokens:
        sns = t.get("sns")
        if not isinstance(sns, dict):
            continue
        if sns.get("class") != "DISCOURSE_MARKER":
            continue
        s = int(t["start"])
        e = int(t["end"])
        surf = text[s:e]
        if not surf:
            continue
        out.append(
            {
                "type": "SNS_DISCOURSE",
                "subtype": str(sns.get("subtype") or "OTHER"),
                "intensity": int(sns.get("intensity") or 1),
                "text": surf,
                "start": s,
                "end": e,
            }
        )

    # de-dup preserve order
    seen = set()
    dedup: List[Dict[str, Any]] = []
    for x in out:
        k = (x["text"], x["start"], x["end"], x["subtype"], x["intensity"])
        if k in seen:
            continue
        seen.add(k)
        dedup.append(x)
    return dedup
```
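Input and output shapes, reconstructed from the field accesses above (the marker token and the `"LAUGH"` subtype value are hypothetical examples; the code accepts any subtype string):

```python
text = "ㅋㅋㅋ 서울 갔다"
tokens = [
    {"text": "ㅋㅋㅋ", "start": 0, "end": 3,
     "sns": {"class": "DISCOURSE_MARKER", "subtype": "LAUGH", "intensity": 3}},
    {"text": "서울", "start": 4, "end": 6},
]
build_sns_entities(tokens, text=text)
# -> [{"type": "SNS_DISCOURSE", "subtype": "LAUGH", "intensity": 3,
#      "text": "ㅋㅋㅋ", "start": 0, "end": 3}]
```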
```python
def filter_ner_tokens(
    tokens: List[SegmentToken],
    *,
    drop_sns_discourse: bool = True,
    drop_punct_or_space: bool = True,
) -> List[SegmentToken]:
    """
    Filter tokens for a typical NER model input stream.
    """
    out: List[SegmentToken] = []
    for t in tokens:
        tt = str(t.get("text") or "")
        if drop_sns_discourse and _is_discourse_marker(t):
            continue
        if drop_punct_or_space and _is_punct_or_space(tt):
            continue
        out.append(t)
    return out
```
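Continuing the sketch above, the same token stream filtered for model input:

```python
tokens = [
    {"text": "ㅋㅋㅋ", "start": 0, "end": 3,
     "sns": {"class": "DISCOURSE_MARKER", "subtype": "LAUGH", "intensity": 3}},
    {"text": "서울", "start": 4, "end": 6},
    {"text": "!!", "start": 6, "end": 8},
]
filter_ner_tokens(tokens)
# -> [{"text": "서울", "start": 4, "end": 6}]
# The marker is dropped as SNS discourse, "!!" as punctuation-only.
```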
```python
def merged_surfaces_from_offsets(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
) -> List[str]:
    """
    Merge contiguous tokens (no gaps in offsets) into surface strings.

    This is often what users *expect* to feed into downstream NER components,
    especially for morpheme-split languages (ko) or mixed-script spans (LG전자).
    """
    return merged_surfaces_from_offsets_ex(tokens, text=text, max_len=max_len, dedup=True)


def merged_surfaces_from_offsets_ex(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
    dedup: bool = False,
) -> List[str]:
    """
    Merge contiguous tokens (no gaps in offsets) into surface strings.

    If dedup=True, de-duplicate while preserving order (useful for display).
    For NER model inputs, prefer dedup=False (order/duplicates matter).
    """
    out: List[str] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        s0 = int(a["start"])
        e0 = int(a["end"])
        j = i + 1
        while j < len(tokens):
            b = tokens[j]
            bs = int(b["start"])
            be = int(b["end"])
            if bs != e0:
                break
            if (be - s0) > max_len:
                break
            e0 = be
            j += 1
        surf = text[s0:e0].strip()
        if surf:
            out.append(surf)
        i = j

    if not dedup:
        return out

    seen = set()
    deduped: List[str] = []
    for s in out:
        if s in seen:
            continue
        seen.add(s)
        deduped.append(s)
    return deduped
```
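The merge is purely offset-driven: tokens are glued only when one ends exactly where the next starts, so whitespace naturally breaks groups. Using the mixed-script example from the docstring:

```python
text = "LG전자 출시"
tokens = [
    {"text": "LG",  "start": 0, "end": 2},
    {"text": "전자", "start": 2, "end": 4},  # contiguous with "LG" -> merged
    {"text": "출시", "start": 5, "end": 7},  # gap (the space) at offset 4 -> new group
]
merged_surfaces_from_offsets_ex(tokens, text=text)
# -> ["LG전자", "출시"]
```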
```python
def merged_token_groups_from_offsets(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
) -> List[Dict[str, Any]]:
    """
    Like merged_surfaces_from_offsets, but returns groups with their member tokens.
    """
    out: List[Dict[str, Any]] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        s0 = int(a["start"])
        e0 = int(a["end"])
        j = i + 1
        while j < len(tokens):
            b = tokens[j]
            bs = int(b["start"])
            be = int(b["end"])
            if bs != e0:
                break
            if (be - s0) > max_len:
                break
            e0 = be
            j += 1
        surf = text[s0:e0].strip()
        if surf:
            out.append({"text": surf, "start": s0, "end": e0, "tokens": tokens[i:j]})
        i = j
    return out


def ner_preprocess(
    text: str,
    *,
    lang: str,
    sns: bool = True,
    morphology: Optional[bool] = None,
    include_token_hints: bool = False,
    include_function_word_hints: bool = False,
    drop_function_words: bool = True,
    include_pos4_hints: bool = False,
    use_surfaces: bool = True,
) -> Dict[str, Any]:
    """
    One-shot helper for NER pipelines:
    - normalize (SNS-aware if sns=True)
    - segment with offsets + sns tags
    - return (A) tokens for NER input (markers/punct removed) and
      (B) SNS markers as separate "entities"
    """
    # NOTE: legacy segmentation logic lives in tokmor.legacy_api.
    # NER preprocessing remains stable regardless of which public API is "primary".
    from .legacy_api import segment
    from .preprocess import normalize_text

    text_norm = normalize_text(text, sns=bool(sns))
    seg = segment(
        text_norm,
        lang=lang,
        sns=bool(sns),
        morphology=morphology,
        include_sns_tags=True,
    )
    tokens = list(seg.get("tokens") or [])
    sns_entities = build_sns_entities(tokens, text=text_norm)
    ner_tokens = filter_ner_tokens(tokens)
    if drop_function_words:
        # Hard-block typical function words from NER input stream.
        # This is intentionally conservative: if we don't know, we keep the token.
        ner_tokens = [
            t
            for t in ner_tokens
            if (function_word_tag(lang, str(t.get("text") or "")) or "") not in _HARD_BLOCK_FWTAGS
        ]
    ner_surfaces = merged_surfaces_from_offsets(ner_tokens, text=text_norm)
    groups = merged_token_groups_from_offsets(ner_tokens, text=text_norm)

    out: Dict[str, Any] = {
        "schema_version": seg.get("schema_version"),
        "tokmor_version": seg.get("tokmor_version"),
        "lang": seg.get("lang"),
        "text_norm": text_norm,
        "tokens": tokens,
        "ner_tokens": ner_tokens,
        "ner_surfaces": ner_surfaces,
        "sns_entities": sns_entities,
    }

    if include_token_hints:
        out["ner_token_hints"] = [token_shape_hint(str(t.get("text") or "")) for t in ner_tokens]

    if include_function_word_hints:
        out["ner_function_words"] = [function_word_tag(lang, str(t.get("text") or "")) for t in ner_tokens]

    if include_pos4_hints:
        out["ner_pos4"] = [pos4_hint(lang, str(t.get("text") or "")) for t in ner_tokens]

    if use_surfaces:
        # Model-oriented surface tokens (ordered; not de-duped)
        out["ner_input_tokens"] = [g["text"] for g in groups]

        if include_token_hints:
            out["ner_input_token_hints"] = [token_shape_hint(str(g["text"])) for g in groups]

        if include_pos4_hints:
            # Aggregate morpheme-level hints to surface-level.
            # Priority: V > ADJ > ADV > N > UNK
            def _agg_pos4(g: Dict[str, Any]) -> str:
                tags = [pos4_hint(lang, str(t.get("text") or "")) for t in (g.get("tokens") or [])]
                if "V" in tags:
                    return "V"
                if "ADJ" in tags:
                    return "ADJ"
                if "ADV" in tags:
                    return "ADV"
                if "N" in tags:
                    return "N"
                return "UNK"

            out["ner_input_pos4"] = [_agg_pos4(g) for g in groups]

    return out
```
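A sketch of the intended call pattern, assuming the wheel is installed as `tokmor` (the sample sentence is arbitrary; the result keys are the ones constructed in the dict above):

```python
from tokmor.ner_prep import ner_preprocess

res = ner_preprocess(
    "ㅋㅋㅋ 삼성전자가 2024-01-15에 발표했다",
    lang="ko",
    include_token_hints=True,
    include_pos4_hints=True,
)

res["ner_input_tokens"]  # merged surface tokens for the NER model
res["ner_input_pos4"]    # aggregated N/V/ADJ/ADV/UNK hints, same length
res["sns_entities"]      # discourse markers, reported separately
res["tokens"]            # full token stream with offsets, untouched
```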