tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/ner_prep.py ADDED
@@ -0,0 +1,747 @@
"""
NER preprocessing helpers (still POS/NER-free)
==============================================

TokMor core does NOT provide NER models. This module provides a small, deterministic
helper layer to make it *easy* to apply TokMor outputs to NER pipelines:

- run `segment(..., include_sns_tags=True)` to get stable tokens + offsets
- emit SNS discourse markers as separate "NER-style" entities
- filter out discourse markers / punctuation from the token stream used by NER
"""

from __future__ import annotations

import re
import unicodedata
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional

SegmentToken = Dict[str, Any]

_RX_DATE = re.compile(r"^(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}:\d{2}(:\d{2})?)$")


def _is_punct_or_space(text: str) -> bool:
    if not text:
        return True
    if text.strip() == "":
        return True
    cats = [unicodedata.category(c) for c in text]
    return all(c.startswith(("P", "S")) for c in cats)


def _is_discourse_marker(token: SegmentToken) -> bool:
    sns = token.get("sns")
    return isinstance(sns, dict) and sns.get("class") == "DISCOURSE_MARKER"


def _is_number(text: str) -> bool:
    if not text:
        return False
    t = text.replace(",", "").replace("_", "")
    t = t.replace("-", "", 1)
    t = t.replace(".", "", 1)
    return t.isdigit()


def token_shape_hint(text: str) -> str:
    """
    Tiny deterministic hint for NER pipelines (NOT a POS tagger).

    Returns one of:
    - Q: number-like
    - T: date/time-like
    - S: punct/symbol-only
    - O: other/unknown
    """
    t = str(text or "")
    if not t:
        return "O"
    if _is_punct_or_space(t):
        return "S"
    if _RX_DATE.match(t):
        return "T"
    if _is_number(t):
        return "Q"
    return "O"

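# Illustration (not part of ner_prep.py): expected token_shape_hint() outputs,
# derived directly from the rules above; the import path assumes the installed
# package layout shown in this wheel.
from tokmor.ner_prep import token_shape_hint

assert token_shape_hint("2024-01-15") == "T"   # matches _RX_DATE
assert token_shape_hint("12:30") == "T"
assert token_shape_hint("1,234.56") == "Q"     # all digits once "," "." "-" are stripped
assert token_shape_hint("!!") == "S"           # punctuation/symbol-only
assert token_shape_hint("Seoul") == "O"        # no shape claim; left to the NER model
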
@lru_cache(maxsize=512)
def _lang_configs_dir() -> Optional[Path]:
    """
    Resolve lang_configs directory if available.

    Priority:
    - TOKMOR_LANG_CONFIGS_DIR (explicit)
    - repo root sibling: TokMor/lang_configs (dev)
    """
    try:
        import os

        p = os.getenv("TOKMOR_LANG_CONFIGS_DIR", "").strip()
        if p:
            d = Path(p).expanduser().resolve()
        else:
            # ner_prep.py -> TokMor_v1/tokmor/ner_prep.py, repo root is parents[2]
            d = Path(__file__).resolve().parents[2] / "lang_configs"
        return d if d.exists() else None
    except Exception:
        return None


@lru_cache(maxsize=512)
def _load_lang_config_json(lang: str) -> Optional[Dict[str, Any]]:
    """
    Load lang_configs/{lang}.json (if available).
    """
    d = _lang_configs_dir()
    if not d:
        return None
    base = (lang or "").split("-", 1)[0].lower()
    fp = d / f"{base}.json"
    if not fp.exists():
        return None
    try:
        import json

        data = json.loads(fp.read_text("utf-8", errors="replace"))
        return data if isinstance(data, dict) else None
    except Exception:
        return None


@lru_cache(maxsize=256)
def _function_word_map(lang: str) -> Dict[str, str]:
    """
    Best-effort function-word lexicon for the given language.

    Source: tokmor morphology analyzers (built-in, small, high-precision).
    Output values are analyzer-specific tags (DET/PRON/PREP/CONJ/AUX/NEG/...).
    """
    out: Dict[str, str] = {}

    base = (lang or "").split("-", 1)[0].lower()  # zh-cn -> zh

    # 0) Prefer lang_configs/*.json if present (358 langs).
    try:
        data = _load_lang_config_json(base) or {}
        pos = data.get("pos", {}) if isinstance(data, dict) else {}

        # We only materialize *function-ish* tags into the map.
        # Do NOT treat ADV/ADJ/NOUN as function words for hard-blocking.
        fields = [
            ("determiners", "DET"),
            ("pronouns", "PRON"),
            ("auxiliaries", "AUX"),
            ("prepositions", "ADP"),
            ("conjunctions", "CCONJ"),
            ("particles", "PART"),
            ("postpositions", "ADP"),
        ]
        for key, tag in fields:
            xs = pos.get(key, [])
            if isinstance(xs, list):
                for w in xs:
                    if isinstance(w, str) and w:
                        out[w] = tag
    except Exception:
        pass

    # 1) Fallback: use tokmor morphology analyzers' built-in tiny function_words dicts.
    # Keep this conservative: ignore large open-class buckets (esp. English adverbs).
    try:
        from .morphology.unified import get_unified_analyzer

        an = get_unified_analyzer(lang)
        sa = getattr(an, "specialized_analyzer", None)
        if not sa:
            return out

        fw = getattr(sa, "function_words", None)
        if isinstance(fw, dict):
            for k, v in fw.items():
                if isinstance(k, str) and k:
                    out.setdefault(k, str(v))

        # Some analyzers keep closed-class buckets (sets) instead of a single dict.
        # NOTE: do NOT use "adverbs" here (often huge / open-class).
        buckets = [
            ("determiners", "DET"),
            ("pronouns", "PRON"),
            ("auxiliaries", "AUX"),
            ("prepositions", "ADP"),
            ("conjunctions", "CCONJ"),
            ("particles", "PART"),
            ("postpositions", "ADP"),
        ]
        for attr, tag in buckets:
            s = getattr(sa, attr, None)
            if isinstance(s, set):
                for k in s:
                    if isinstance(k, str) and k:
                        out.setdefault(k, tag)
            elif isinstance(s, dict):
                for k in s.keys():
                    if isinstance(k, str) and k:
                        out.setdefault(k, tag)
    except Exception:
        return out

    return out


def function_word_tag(lang: str, token_text: str) -> Optional[str]:
    """
    Return a small closed-class tag if `token_text` looks like a function word for `lang`.
    Conservative: prefers exact surface match; falls back to lowercased match.
    """
    t = str(token_text or "")
    if not t:
        return None
    m = _function_word_map(lang)
    if not m:
        return None
    if t in m:
        return m[t]
    tl = t.lower()
    if tl in m:
        return m[tl]
    return None


_HARD_BLOCK_FWTAGS = {"DET", "PRON", "AUX", "ADP", "CCONJ", "SCONJ", "PART"}

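# Illustration (not part of ner_prep.py): how the hard-block set is applied by
# ner_preprocess() further below. Whether "the" actually maps to DET depends on
# which lexicon (lang_configs / analyzer) is present at runtime, so the outcomes
# in the comments are assumptions; keep_for_ner is a hypothetical helper name.
from tokmor.ner_prep import _HARD_BLOCK_FWTAGS, function_word_tag

def keep_for_ner(lang: str, surface: str) -> bool:
    # Unknown words are kept; only confident closed-class hits are dropped.
    tag = function_word_tag(lang, surface) or ""
    return tag not in _HARD_BLOCK_FWTAGS

keep_for_ner("en", "the")    # likely False if "the" -> DET in the loaded lexicon
keep_for_ner("en", "Tesla")  # True: the lookup abstains, so the token is kept
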
@lru_cache(maxsize=2048)
def _extended_pos_map(lang: str) -> Dict[str, str]:
    """
    Load optional external extended POS hints (surface -> coarse tag).

    Expected location (runtime):
        TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json

    This is built by: scripts/build_tokmor_data_pack_from_wiktextract.py --build-extended-dict
    """
    # Allow disabling large optional dictionaries entirely for "lite" deployments.
    try:
        import os

        v = (os.getenv("TOKMOR_DISABLE_EXTENDED_DICT", "") or "").strip().lower()
        if v in {"1", "true", "yes", "y", "on"}:
            return {}
    except Exception:
        pass
    try:
        from . import resources
    except Exception:
        return {}

    base = (lang or "").split("-", 1)[0].lower()
    p = resources.data_dir() / "extended_dict" / f"{base}_extended.json"
    if not p.exists():
        return {}
    try:
        import json

        obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
        if isinstance(obj, dict):
            # normalize to str->str
            out: Dict[str, str] = {}
            for k, v in obj.items():
                if not isinstance(k, str) or not k:
                    continue
                if not isinstance(v, str) or not v:
                    continue
                out[k] = v
            return out
    except Exception:
        return {}
    return {}


def _pos4_from_extended_dict(lang: str, token_text: str) -> Optional[str]:
    """
    Return POS4 from external extended_dict if present, else None.
    """
    t = str(token_text or "")
    if not t:
        return None
    base = (lang or "").split("-", 1)[0].lower()

    m = _extended_pos_map(base)
    if not m:
        return None

    # Match builder behavior: some languages lower-case keys.
    key = t
    if base in {"en", "de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi"}:
        key = t.lower()
    tag = m.get(key) or m.get(t) or m.get(t.lower())
    if not tag:
        return None
    u = tag.upper()
    if u in {"NOUN", "PROPN"}:
        return "N"
    if u in {"VERB", "AUX"}:
        return "V"
    if u == "ADJ":
        return "ADJ"
    if u == "ADV":
        return "ADV"
    return None

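# Illustration (not part of ner_prep.py): the extended dictionary is read as a
# flat surface -> tag JSON object. The file contents below are invented to show
# the expected shape and how its tags collapse to POS4.
#
#   TOKMOR_DATA_DIR/extended_dict/en_extended.json (hypothetical):
#     {"river": "NOUN", "run": "VERB", "quickly": "ADV", "of": "ADP"}
#
#   _pos4_from_extended_dict("en", "river")   -> "N"    (NOUN/PROPN -> N)
#   _pos4_from_extended_dict("en", "quickly") -> "ADV"
#   _pos4_from_extended_dict("en", "of")      -> None   (ADP is not a POS4 tag)
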
@lru_cache(maxsize=50000)
def _pos4_hint_cached(lang: str, token_text: str) -> str:
    """
    POS4 hints for NER: only N/V/ADJ/ADV when confident, else UNK.
    - N: NOUN/PROPN-ish (content noun, incl. proper noun)
    - V: VERB/AUX-ish
    - ADJ: ADJ-ish
    - ADV: ADV-ish
    - UNK: abstain
    """
    t = str(token_text or "")
    if not t or len(t) > 64:
        return "UNK"

    # Never try to POS-tag pure punctuation/symbols/dates/numbers here.
    # Those are covered by token_shape_hint(Q/T/S) already.
    if token_shape_hint(t) != "O":
        return "UNK"

    # Korean morpheme tokens: unified analyzer often returns coarse nouns for stems/endings.
    # Add a tiny whitelist to improve V hints in morpheme-split output (ko default morphology=True).
    base = (lang or "").split("-", 1)[0].lower()
    if base == "ko":
        ko_verbish = {
            # common light verbs / auxiliaries (surface fragments)
            "하", "되", "있", "없", "이", "아니",
            # common past/tense/ending fragments seen in our tokenizer output
            "었", "았", "였", "겠", "시", "지", "고", "서", "면", "다", "요",
        }
        if t in ko_verbish:
            return "V"

    # 0) If an external extended_dict exists, prefer it as a cheap high-coverage hint.
    # This is the primary "best-effort POS4" mechanism for many languages.
    try:
        ex = _pos4_from_extended_dict(base, t)
        if ex:
            return ex
    except Exception:
        pass

    # 1) Prefer lang_configs/*.json word lists + suffixes (358 langs) if present.
    try:
        data = _load_lang_config_json(base) or {}
        pos = data.get("pos", {}) if isinstance(data, dict) else {}
        tl = t.lower()

        def _in_list(key: str, token: str) -> bool:
            xs = pos.get(key, [])
            return isinstance(xs, list) and token in xs

        # Direct word lists (these are small and high-signal)
        if _in_list("nouns", tl):
            return "N"
        if _in_list("adjectives", tl):
            return "ADJ"
        if _in_list("adverbs", tl):
            return "ADV"
        # Treat auxiliaries as verbish for POS4 purposes
        if _in_list("auxiliaries", tl):
            return "V"

        # Suffix hints (conservative)
        suf = data.get("suffixes", {}) if isinstance(data, dict) else {}
        if isinstance(suf, dict) and t.isalpha() and len(t) >= 4:
            advs = suf.get("adv", [])
            adjs = suf.get("adj", [])
            verbs = suf.get("verb", [])
            nouns = suf.get("noun", [])

            if isinstance(advs, list) and any(tl.endswith(s) for s in advs if isinstance(s, str) and s):
                return "ADV"
            if isinstance(adjs, list) and any(tl.endswith(s) for s in adjs if isinstance(s, str) and s):
                return "ADJ"
            if isinstance(verbs, list) and any(tl.endswith(s) for s in verbs if isinstance(s, str) and s):
                # Avoid over-tagging in non-Latin scripts unless explicitly configured
                return "V" if (tl.isascii() or base in {"ar"}) else "UNK"
            if isinstance(nouns, list) and any(tl.endswith(s) for s in nouns if isinstance(s, str) and s):
                # Noun suffixes are noisy; apply only for Latin-ish tokens.
                if tl.isascii():
                    return "N"
    except Exception:
        pass

    # 1.5) Tiny global heuristic: Latin TitleCase often indicates a name (helps vi/fr/es/...).
    if t[:1].isupper() and t.isalpha() and len(t) >= 2:
        if not (t.isupper() and len(t) <= 3):
            return "N"

    # 2) Fallback: use tokmor unified morphology tags (token-level, abstaining)
    try:
        from .morphology.unified import get_unified_analyzer

        an = get_unified_analyzer(lang)

        # Chinese: prefer specialized analyzer morphemes (they contain POS like n/v/a/d/p/u...).
        if base.startswith("zh"):
            sa = getattr(an, "specialized_analyzer", None)
            if sa and hasattr(sa, "analyze"):
                r = sa.analyze(t)
                morphemes = None
                if hasattr(r, "best") and hasattr(r.best, "morphemes"):
                    morphemes = r.best.morphemes
                elif hasattr(r, "morphemes"):
                    morphemes = r.morphemes
                tags: List[str] = []
                if morphemes:
                    for m in morphemes:
                        p = str(getattr(m, "pos", "") or "")
                        if not p:
                            continue
                        # Common Chinese tags in our analyzer:
                        # n/ns/nr/nt/nrt... noun-ish, v verb, a adj, d adv, r pron
                        if p.startswith("n"):
                            tags.append("N")
                        elif p.startswith("v"):
                            tags.append("V")
                        elif p.startswith("a"):
                            tags.append("ADJ")
                        elif p.startswith("d"):
                            tags.append("ADV")
                        elif p == "r":
                            tags.append("N")
                if tags:
                    # Priority: V > ADJ > ADV > N (matches surface aggregation)
                    if "V" in tags:
                        return "V"
                    if "ADJ" in tags:
                        return "ADJ"
                    if "ADV" in tags:
                        return "ADV"
                    if "N" in tags:
                        return "N"

        morphs = an.analyze(t)
        tags: List[str] = []
        for m in morphs:
            p = str(getattr(m, "pos", "") or "")
            pu = p.upper()
            # universal-style
            if pu in {"NOUN", "PROPN"}:
                tags.append("N")
            elif pu in {"VERB", "AUX"}:
                tags.append("V")
            elif pu == "ADJ":
                tags.append("ADJ")
            elif pu == "ADV":
                tags.append("ADV")
            else:
                # Korean (Sejong-ish)
                if base == "ko":
                    b = p.split("+", 1)[0]
                    if b.startswith("NN"):
                        tags.append("N")
                    elif b in {"VV", "VX", "VCP", "VCN"}:
                        tags.append("V")
                    elif b == "VA":
                        tags.append("ADJ")
                    elif b in {"MAG", "MAJ"}:
                        tags.append("ADV")
                    # derivational verb/adjective suffixes and endings -> verbish for NER hints
                    elif b in {"XSV"} or b.startswith("E"):
                        tags.append("V")
                    elif b in {"XSA"}:
                        tags.append("ADJ")
                # Japanese (string tags)
                if base == "ja":
                    if "名詞" in p:
                        tags.append("N")
                    elif "動詞" in p or "助動詞" in p:
                        tags.append("V")
                    elif "形容詞" in p:
                        tags.append("ADJ")
                    elif "副詞" in p:
                        tags.append("ADV")
                # Chinese (jieba-like short tags)
                if base.startswith("zh"):
                    if p.startswith("n"):
                        tags.append("N")
                    elif p.startswith("v"):
                        tags.append("V")
                    elif p.startswith("a"):
                        tags.append("ADJ")
                    elif p == "d":
                        tags.append("ADV")
        if not tags:
            return "UNK"
        # majority vote with abstain
        from collections import Counter

        c = Counter(tags)
        top, n = c.most_common(1)[0]
        if n / len(tags) < 0.75:
            return "UNK"
        return top
    except Exception:
        return "UNK"

    return "UNK"


def pos4_hint(lang: str, token_text: str) -> str:
    return _pos4_hint_cached(lang, token_text)

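# Illustration (not part of ner_prep.py): the analyzer fallback above abstains
# unless one tag covers at least 75% of the morpheme-level votes. _vote is a
# hypothetical stand-alone copy of that rule.
from collections import Counter

def _vote(tags):
    if not tags:
        return "UNK"
    top, n = Counter(tags).most_common(1)[0]
    return top if n / len(tags) >= 0.75 else "UNK"

_vote(["N", "N", "V"])       # "UNK" (2/3 ~ 0.67 is below the threshold)
_vote(["V", "V", "V", "N"])  # "V"   (3/4 = 0.75 passes)
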
def build_sns_entities(tokens: List[SegmentToken], *, text: str) -> List[Dict[str, Any]]:
    """
    Convert tokens tagged as DISCOURSE_MARKER into NER-style structured entities.
    """
    out: List[Dict[str, Any]] = []
    for t in tokens:
        sns = t.get("sns")
        if not isinstance(sns, dict):
            continue
        if sns.get("class") != "DISCOURSE_MARKER":
            continue
        s = int(t["start"])
        e = int(t["end"])
        surf = text[s:e]
        if not surf:
            continue
        out.append(
            {
                "type": "SNS_DISCOURSE",
                "subtype": str(sns.get("subtype") or "OTHER"),
                "intensity": int(sns.get("intensity") or 1),
                "text": surf,
                "start": s,
                "end": e,
            }
        )

    # de-dup preserve order
    seen = set()
    dedup: List[Dict[str, Any]] = []
    for x in out:
        k = (x["text"], x["start"], x["end"], x["subtype"], x["intensity"])
        if k in seen:
            continue
        seen.add(k)
        dedup.append(x)
    return dedup

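# Illustration (not part of ner_prep.py): one hand-built token and the entity it
# produces. The subtype/intensity values are invented; real values come from the
# SNS tagger when segment(..., include_sns_tags=True) is used.
from tokmor.ner_prep import build_sns_entities

text = "안녕하세요 ㅋㅋㅋ"
toks = [{"text": "ㅋㅋㅋ", "start": 6, "end": 9,
         "sns": {"class": "DISCOURSE_MARKER", "subtype": "LAUGH", "intensity": 3}}]

build_sns_entities(toks, text=text)
# -> [{"type": "SNS_DISCOURSE", "subtype": "LAUGH", "intensity": 3,
#      "text": "ㅋㅋㅋ", "start": 6, "end": 9}]
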
def filter_ner_tokens(
    tokens: List[SegmentToken],
    *,
    drop_sns_discourse: bool = True,
    drop_punct_or_space: bool = True,
) -> List[SegmentToken]:
    """
    Filter tokens for a typical NER model input stream.
    """
    out: List[SegmentToken] = []
    for t in tokens:
        tt = str(t.get("text") or "")
        if drop_sns_discourse and _is_discourse_marker(t):
            continue
        if drop_punct_or_space and _is_punct_or_space(tt):
            continue
        out.append(t)
    return out


def merged_surfaces_from_offsets(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
) -> List[str]:
    """
    Merge contiguous tokens (no gaps in offsets) into surface strings.

    This is often what users *expect* to feed into downstream NER components,
    especially for morpheme-split languages (ko) or mixed-script spans (LG전자).
    """
    return merged_surfaces_from_offsets_ex(tokens, text=text, max_len=max_len, dedup=True)


def merged_surfaces_from_offsets_ex(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
    dedup: bool = False,
) -> List[str]:
    """
    Merge contiguous tokens (no gaps in offsets) into surface strings.

    If dedup=True, de-duplicate while preserving order (useful for display).
    For NER model inputs, prefer dedup=False (order/duplicates matter).
    """
    out: List[str] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        s0 = int(a["start"])
        e0 = int(a["end"])
        j = i + 1
        while j < len(tokens):
            b = tokens[j]
            bs = int(b["start"])
            be = int(b["end"])
            if bs != e0:
                break
            if (be - s0) > max_len:
                break
            e0 = be
            j += 1
        surf = text[s0:e0].strip()
        if surf:
            out.append(surf)
        i = j

    if not dedup:
        return out

    seen = set()
    deduped: List[str] = []
    for s in out:
        if s in seen:
            continue
        seen.add(s)
        deduped.append(s)
    return deduped

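# Illustration (not part of ner_prep.py): the merge rule joins tokens whose
# offsets touch and stops at any gap, so a mixed-script span like "LG전자" is
# rebuilt while the space before "주가" starts a new group. The token dicts only
# carry the keys the function actually reads.
from tokmor.ner_prep import merged_surfaces_from_offsets_ex

text = "LG전자 주가"
toks = [{"text": "LG", "start": 0, "end": 2},
        {"text": "전자", "start": 2, "end": 4},
        {"text": "주가", "start": 5, "end": 7}]

merged_surfaces_from_offsets_ex(toks, text=text)
# -> ["LG전자", "주가"]
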
def merged_token_groups_from_offsets(
    tokens: List[SegmentToken],
    *,
    text: str,
    max_len: int = 48,
) -> List[Dict[str, Any]]:
    """
    Like merged_surfaces_from_offsets, but returns groups with their member tokens.
    """
    out: List[Dict[str, Any]] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        s0 = int(a["start"])
        e0 = int(a["end"])
        j = i + 1
        while j < len(tokens):
            b = tokens[j]
            bs = int(b["start"])
            be = int(b["end"])
            if bs != e0:
                break
            if (be - s0) > max_len:
                break
            e0 = be
            j += 1
        surf = text[s0:e0].strip()
        if surf:
            out.append({"text": surf, "start": s0, "end": e0, "tokens": tokens[i:j]})
        i = j
    return out


def ner_preprocess(
    text: str,
    *,
    lang: str,
    sns: bool = True,
    morphology: Optional[bool] = None,
    include_token_hints: bool = False,
    include_function_word_hints: bool = False,
    drop_function_words: bool = True,
    include_pos4_hints: bool = False,
    use_surfaces: bool = True,
) -> Dict[str, Any]:
    """
    One-shot helper for NER pipelines:
    - normalize (SNS-aware if sns=True)
    - segment with offsets + sns tags
    - return (A) tokens for NER input (markers/punct removed) and
      (B) SNS markers as separate "entities"
    """
    # NOTE: legacy segmentation logic lives in tokmor.legacy_api.
    # NER preprocessing remains stable regardless of which public API is "primary".
    from .legacy_api import segment
    from .preprocess import normalize_text

    text_norm = normalize_text(text, sns=bool(sns))
    seg = segment(
        text_norm,
        lang=lang,
        sns=bool(sns),
        morphology=morphology,
        include_sns_tags=True,
    )
    tokens = list(seg.get("tokens") or [])
    sns_entities = build_sns_entities(tokens, text=text_norm)
    ner_tokens = filter_ner_tokens(tokens)
    if drop_function_words:
        # Hard-block typical function words from NER input stream.
        # This is intentionally conservative: if we don't know, we keep the token.
        ner_tokens = [
            t
            for t in ner_tokens
            if (function_word_tag(lang, str(t.get("text") or "")) or "") not in _HARD_BLOCK_FWTAGS
        ]
    ner_surfaces = merged_surfaces_from_offsets(ner_tokens, text=text_norm)
    groups = merged_token_groups_from_offsets(ner_tokens, text=text_norm)

    out: Dict[str, Any] = {
        "schema_version": seg.get("schema_version"),
        "tokmor_version": seg.get("tokmor_version"),
        "lang": seg.get("lang"),
        "text_norm": text_norm,
        "tokens": tokens,
        "ner_tokens": ner_tokens,
        "ner_surfaces": ner_surfaces,
        "sns_entities": sns_entities,
    }

    if include_token_hints:
        out["ner_token_hints"] = [token_shape_hint(str(t.get("text") or "")) for t in ner_tokens]

    if include_function_word_hints:
        out["ner_function_words"] = [function_word_tag(lang, str(t.get("text") or "")) for t in ner_tokens]

    if include_pos4_hints:
        out["ner_pos4"] = [pos4_hint(lang, str(t.get("text") or "")) for t in ner_tokens]

    if use_surfaces:
        # Model-oriented surface tokens (ordered; not de-duped)
        out["ner_input_tokens"] = [g["text"] for g in groups]

        if include_token_hints:
            out["ner_input_token_hints"] = [token_shape_hint(str(g["text"])) for g in groups]

        if include_pos4_hints:
            # Aggregate morpheme-level hints to surface-level.
            # Priority: V > ADJ > ADV > N > UNK
            def _agg_pos4(g: Dict[str, Any]) -> str:
                tags = [pos4_hint(lang, str(t.get("text") or "")) for t in (g.get("tokens") or [])]
                if "V" in tags:
                    return "V"
                if "ADJ" in tags:
                    return "ADJ"
                if "ADV" in tags:
                    return "ADV"
                if "N" in tags:
                    return "N"
                return "UNK"

            out["ner_input_pos4"] = [_agg_pos4(g) for g in groups]

    return out
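# Illustration (not part of the wheel): a minimal end-to-end call. The exact
# tokens depend on the models/lexicons installed at runtime, so only the shape
# of the returned dict is shown; the Korean sample sentence is invented.
from tokmor.ner_prep import ner_preprocess

res = ner_preprocess(
    "삼성전자가 오늘 신제품을 공개했다 ㅋㅋㅋ",
    lang="ko",
    include_token_hints=True,
    include_pos4_hints=True,
)

res["ner_input_tokens"]  # surface strings rebuilt from contiguous offsets, function words dropped
res["ner_input_pos4"]    # parallel N/V/ADJ/ADV/UNK hints aggregated per surface
res["sns_entities"]      # discourse markers (e.g. "ㅋㅋㅋ") reported separately from NER tokens
res["tokens"]            # the unfiltered segmentation; offsets index into res["text_norm"]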