tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,855 @@
1
+ """
2
+ Unified Morphological Analyzer
3
+ ==============================
4
+
5
+ Unifies two morphology systems:
6
+ 1. Specialized analyzers (74 languages) - rule-based
7
+ 2. (Optional) lemma lexicon (per language) - dictionary-based (external asset)
8
+
9
+ Note:
10
+ - The TokMor OSS core does not ship large lemma dictionaries in the wheel/sdist.
11
+ - If needed, supply a lemma lexicon via `TOKMOR_DATA_DIR/lemma_dict/` or `TOKMOR_LEMMA_DICT_DIR`.
12
+ """
13
+
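A minimal usage sketch of the external-lexicon setup described in the docstring above; the pack directory below is hypothetical, and without a pack the same call still works through the rule-based fallback:

    import os
    # Hypothetical location of per-language lemma files (e.g. en.sqlite / en.pkl).
    os.environ["TOKMOR_LEMMA_DICT_DIR"] = "/data/tokmor/lemma_dict"

    from tokmor.morphology.unified import lemmatize
    print(lemmatize("running", lang="en"))  # typically "run" (lexicon if present, else rules)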
14
+ import pickle
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Optional, List, Dict, Any, Tuple
18
+ from dataclasses import dataclass
19
+
20
+ from .advanced_base import AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult
21
+ from . import ANALYZERS
22
+ from .. import resources
23
+ from ..lemma_store import BaseLemmaStore, load_lemma_store
24
+
25
+
26
+ @dataclass
27
+ class LemmaResult:
28
+ """Lemmatization result"""
29
+ word: str
30
+ lemma: str
31
+ pos: Optional[str] = None
32
+ features: Optional[str] = None
33
+ confidence: float = 1.0
34
+ source: str = 'unknown' # 'lexicon', 'specialized', 'model', 'rule', 'fallback'
35
+
36
+
37
+ class UnifiedMorphAnalyzer:
38
+ """
39
+ Unified morphological analyzer
40
+
41
+ Priority:
42
+ 1. Specialized analyzer (if available)
43
+ 2. Lemma lexicon (if available)
44
+ 3. Rule-based fallback
45
+ """
46
+
47
+ def __init__(self, lang: str):
48
+ self.lang = lang.lower()
49
+ self.specialized_analyzer = None
50
+ self._lemma_store: Optional[BaseLemmaStore] = None # lazy-loaded (sqlite/pkl)
51
+ self._load_analyzers()
52
+
53
+ def _load_analyzers(self):
54
+ """분석기 로드"""
55
+ # 1. 특화 분석기 시도
56
+ if self.lang in ANALYZERS:
57
+ try:
58
+ self.specialized_analyzer = ANALYZERS[self.lang]()
59
+ except Exception:
60
+ pass
61
+
62
+ def _get_lemma_store(self) -> Optional[BaseLemmaStore]:
63
+ """
64
+ Load lemma lexicon store (surface -> lemma) if available.
65
+
66
+ Location (priority):
67
+ - TOKMOR_LEMMA_DICT_DIR/{lang}.sqlite|.db|.pkl ...
68
+ - tokmor/models/lemma_dict/{lang}.sqlite|.db|.pkl ...
69
+ - (legacy) some deployments used additional lemma packs under TOKMOR_DATA_DIR
70
+ """
71
+ if self._lemma_store is not None:
72
+ return self._lemma_store
73
+
74
+ # Allow lemma aliasing for top100 wiki-style codes (dev convenience)
75
+ lex_lang = resources.normalize_lang_for_lemma(self.lang) if hasattr(resources, "normalize_lang_for_lemma") else self.lang
76
+ lex_path = resources.resolve_lemma_dict_path(lex_lang)
77
+ if not lex_path:
78
+ self._lemma_store = None
79
+ return None
80
+
81
+ try:
82
+ self._lemma_store = load_lemma_store(lex_path)
83
+ return self._lemma_store
84
+ except Exception:
85
+ pass
86
+
87
+ self._lemma_store = None
88
+ return None
89
+
90
+ @property
91
+ def source(self) -> str:
92
+ """분석기 소스 반환"""
93
+ if self.specialized_analyzer:
94
+ return 'specialized'
95
+ return 'fallback'
96
+
97
+ @property
98
+ def is_available(self) -> bool:
99
+ """분석 가능 여부"""
100
+ # Quality spec for TokMor preprocessing:
101
+ # - segmentation is handled by tokenizer
102
+ # - lemma is always available at least as an identity fallback
103
+ # Therefore morphology should be "available" for any tokenized language.
104
+ return True
105
+
106
+ def lemmatize(self, word: str, pos_hint: Optional[str] = None) -> str:
107
+ """
108
+ Return the base form (lemma) of a word
109
+
110
+ Args:
111
+ word: input word
112
+
113
+ Returns:
114
+ The base form (lemma)
115
+ """
116
+ result = self.lemmatize_detailed(word, pos_hint=pos_hint)
117
+ return result.lemma
118
+
119
+ def lemmatize_detailed(self, word: str, *, pos_hint: Optional[str] = None) -> LemmaResult:
120
+ """
121
+ Return a detailed lemmatization result
122
+
123
+ Args:
124
+ word: input word
125
+
126
+ Returns:
127
+ LemmaResult with lemma, pos, features, source
128
+ """
129
+ word_lower = word.lower()
130
+
131
+ # 0. Lemma lexicon (if present): highest priority
132
+ store = self._get_lemma_store()
133
+ if store:
134
+ # Latin-script languages: the store uses lowercase keys; otherwise look up the exact surface form.
135
+ key = word_lower if self.lang in {"en", "de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi"} else word
136
+ lemma = store.get(key)
137
+ if lemma:
138
+ # Some legacy lexicons store decomposed lemmas like "먹+었+다".
139
+ # For Korean, prefer the specialized analyzer unless the lexicon provides a clean lemma.
140
+ if self.lang == "ko" and isinstance(lemma, str) and "+" in lemma:
141
+ style = os.getenv("TOKMOR_KO_LEMMA_STYLE", "dict").strip().lower()
142
+ if style not in ("decomp",):
143
+ lemma = None
144
+ if lemma:
145
+ if self.lang == "en":
146
+ lemma = self._en_postprocess_lemma(word, lemma, source="lexicon", pos_hint=pos_hint)
147
+ return LemmaResult(
148
+ word=word,
149
+ lemma=lemma,
150
+ confidence=0.99,
151
+ source="lexicon",
152
+ )
153
+
154
+ # Pack-less mode: prefer conservative fallback rules across languages.
155
+ # Many "advanced analyzers" are dictionary/heuristic segmenters and can output partial lemmas
156
+ # (worse than a stable normalized surface) when no lemma pack is present.
157
+ packless = os.getenv("TOKMOR_DISABLE_LEMMA_PACK", "").strip().lower() in {"1", "true", "yes", "y", "on"}
158
+
159
+ # 1. Try the specialized analyzer
160
+ # NOTE: for Chinese, lemma is effectively identity; specialized segmentation can produce
161
+ # undesirable "partial" lemmas for combined tokens (e.g., dates). Prefer fallback.
162
+ if self.lang == "zh" or (packless and self.lang not in {"ko"}):
163
+ self.specialized_analyzer = None
164
+
165
+ if self.specialized_analyzer:
166
+ try:
167
+ result = self.specialized_analyzer.analyze(word)
168
+
169
+ # Advanced analyzers often return NBestResult -> use .best.morphemes
170
+ morphemes = None
171
+ confidence = 1.0
172
+ if hasattr(result, "best") and hasattr(result.best, "morphemes"):
173
+ morphemes = result.best.morphemes
174
+ confidence = getattr(result.best, "score", 1.0)
175
+ elif hasattr(result, "morphemes"):
176
+ morphemes = result.morphemes
177
+ confidence = getattr(result, "score", 1.0)
178
+
179
+ if morphemes:
180
+ # Korean: reconstruct lemma for verb/adjective and noun+XSV(하다) patterns
181
+ if self.lang == "ko":
182
+ lemma, pos = self._ko_reconstruct_lemma(morphemes, word)
183
+ return LemmaResult(
184
+ word=word,
185
+ lemma=lemma,
186
+ pos=pos,
187
+ features=getattr(morphemes[0], "features", None),
188
+ confidence=float(confidence),
189
+ source="specialized",
190
+ )
191
+
192
+ # Japanese: avoid returning a function morpheme as "lemma" (e.g., もの -> も).
193
+ if self.lang == "ja":
194
+ # Prefer a "content" morpheme lemma; fallback to normalized surface.
195
+ def _is_ja_function_pos(p: Optional[str]) -> bool:
196
+ if not p:
197
+ return False
198
+ # Common function tags in our JA analyzer family
199
+ if p.startswith("助"):
200
+ return True # 助詞/助動詞...
201
+ if p in {"連用形", "終止形", "未然形", "促音便"}:
202
+ return True
203
+ return False
204
+
205
+ for m in morphemes:
206
+ mp = getattr(m, "pos", None)
207
+ if _is_ja_function_pos(mp):
208
+ continue
209
+ lem0 = getattr(m, "lemma", getattr(m, "form", None))
210
+ if isinstance(lem0, str) and lem0:
211
+ return LemmaResult(
212
+ word=word,
213
+ lemma=lem0,
214
+ pos=mp,
215
+ features=getattr(m, "features", None),
216
+ confidence=float(confidence),
217
+ source="specialized",
218
+ )
219
+ # If everything looked like function parts, fall back to rule-based path.
220
+ raise RuntimeError("ja specialized produced only function morphemes")
221
+
222
+ # Default: first morpheme lemma
223
+ m0 = morphemes[0]
224
+ lem = getattr(m0, "lemma", getattr(m0, "form", word))
225
+ if self.lang == "en":
226
+ lem = self._en_postprocess_lemma(word, str(lem), source="specialized", pos_hint=pos_hint)
227
+ return LemmaResult(
228
+ word=word,
229
+ lemma=lem,
230
+ pos=getattr(m0, "pos", None),
231
+ features=getattr(m0, "features", None),
232
+ confidence=float(confidence),
233
+ source="specialized",
234
+ )
235
+ except Exception:
236
+ pass
237
+
238
+ # 2. Rule-based fallback
239
+ lemma = self._fallback_lemmatize(word_lower, original=word)
240
+ if self.lang == "en":
241
+ lemma = self._en_postprocess_lemma(word, lemma, source="fallback", pos_hint=pos_hint)
242
+ return LemmaResult(
243
+ word=word,
244
+ lemma=lemma,
245
+ confidence=0.3,
246
+ source='fallback'
247
+ )
248
+
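A short sketch of how the three paths above surface in the result; which `source` you actually get depends on the installed analyzers and lemma packs:

    from tokmor.morphology.unified import get_unified_analyzer

    ana = get_unified_analyzer("en")
    res = ana.lemmatize_detailed("stopped")
    # res.source is "lexicon", "specialized", or "fallback"; res.lemma is expected to be "stop".
    print(res.word, res.lemma, res.source, res.confidence)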
249
+ def _en_postprocess_lemma(self, word: str, lemma: str, *, source: str = "", pos_hint: Optional[str] = None) -> str:
250
+ """
251
+ English lemma postprocessing to better match common corpus conventions on noisy/web corpora.
252
+ This runs AFTER lexicon lookup and AFTER rule-based fallback.
253
+ """
254
+ import unicodedata
255
+
256
+ w = word or ""
257
+ wl = w.lower()
258
+ lem = lemma or w
259
+
260
+ # 1) Common clitics / contractions (token-level, no context)
261
+ # Note: "'s" is ambiguous (POS=PART vs AUX). In EWT test, "'s" is slightly more common.
262
+ CONTR = {
263
+ "n't": "not",
264
+ "'re": "be",
265
+ "'m": "be",
266
+ "'ve": "have",
267
+ "'ll": "will",
268
+ "'d": "would",
269
+ # "'s" is ambiguous: AUX -> be/have, PART -> 's
270
+ "'s": "'s",
271
+ }
272
+ if wl in CONTR:
273
+ if wl == "'s":
274
+ # Use coarse POS hint when available: AUX/VERB-like -> be
275
+ ph = (pos_hint or "").upper()
276
+ if ph in {"V"}:
277
+ return "be"
278
+ return CONTR[wl]
279
+ # Token "s" (missing apostrophe in noisy text) is ambiguous; in EWT it most often maps to BE.
280
+ if wl == "s" and w == "s":
281
+ ph = (pos_hint or "").upper()
282
+ if ph in {"V"}:
283
+ return "be"
284
+ return "'s"
285
+
286
+ # 2) Preserve punctuation-only tokens exactly (..., --, etc.)
287
+ def _is_punct_or_symbol_only(s: str) -> bool:
288
+ return bool(s) and all(unicodedata.category(ch)[0] in {"P", "S"} for ch in s)
289
+
290
+ if _is_punct_or_symbol_only(w):
291
+ return w
292
+
293
+ # 3) Preserve special mixed tokens (dates, ids, emails, domains, filenames, phone numbers)
294
+ has_digit = any(ch.isdigit() for ch in w)
295
+ has_alpha = any(ch.isalpha() for ch in w)
296
+ has_special_sep = any(ch in {"@", ".", "-", "_", "/", ":"} for ch in w)
297
+ if (has_digit and (has_alpha or has_special_sep)) or (("@" in w) and has_alpha):
298
+ return w
299
+ # phone-like patterns (digits with separators)
300
+ if has_digit and any(ch in {"-", "/"} for ch in w):
301
+ return w
302
+
303
+ # 4) Abbreviations (web/news specific wins)
304
+ if w == "AM":
305
+ return "a.m."
306
+ if w == "PM":
307
+ return "p.m."
308
+ # Abbreviations ending with a dot: keep surface (Inc., Dr., U.S., D.C., ...)
309
+ if "." in w and w.endswith(".") and any(ch.isalpha() for ch in w):
310
+ return w
311
+
312
+ # 5) Acronyms / all-caps
313
+ if len(w) >= 2 and w.isupper() and w.isalpha():
314
+ # Short 2-letter country/region acronyms are usually kept as-is (US, UK, EU, ...).
315
+ if len(w) <= 2:
316
+ return w
317
+ # If lexicon produced a lemma, trust it (common words often appear in all-caps in headlines).
318
+ if source == "lexicon":
319
+ return lem
320
+ # Heuristic: vowel-less all-caps tokens are often technical acronyms -> lowercase (MMBTU -> mmbtu)
321
+ vcnt = sum(1 for ch in w if ch in "AEIOU")
322
+ if vcnt <= 1:
323
+ return wl
324
+ # Otherwise, likely a name written in all-caps -> Titlecase (SOUTER -> Souter)
325
+ return w.title()
326
+
327
+ # 6) Emoticons with letters: lowercase (e.g., :D -> :d)
328
+ if any(ch.isalpha() for ch in w) and sum(ch.isalnum() for ch in w) <= 2 and has_special_sep:
329
+ return w.lower()
330
+
331
+ # 7) Case normalization fixes
332
+ # If lexicon returns a titlecased lemma for a lowercase token, prefer the lowercase surface.
333
+ if w.islower() and len(lem) > 1 and lem[0].isupper() and lem.lower() == wl:
334
+ return w
335
+
336
+ # If token is Titlecase and lemma is the fully-lowercased form, it can be a PROPN-like lemma.
337
+ # Apply only for non-lexicon sources to avoid harming normal sentence-initial capitalization.
338
+ if source != "lexicon" and len(w) >= 2 and w[0].isupper() and w[1:].islower() and lem == wl:
339
+ TITLE_STOP = {
340
+ "i", "a", "an", "the", "this", "that", "these", "those",
341
+ "and", "or", "but", "if", "in", "on", "at", "to", "of", "for", "from", "by", "with", "as",
342
+ "do", "did", "does", "is", "are", "was", "were", "be",
343
+ "my", "your", "our", "their", "his", "her",
344
+ "yes", "no", "hi", "hello", "thanks", "thank", "please", "what", "how", "why", "when", "where", "who",
345
+ "good", "great", "very", "email",
346
+ }
347
+ if wl not in TITLE_STOP:
348
+ return w
349
+
350
+ # 8) Pronoun lemma overrides (unambiguous subset)
351
+ PRON = {
352
+ "us": "we",
353
+ "me": "I",
354
+ "him": "he",
355
+ "them": "they",
356
+ }
357
+ if wl in PRON and w.islower():
358
+ return PRON[wl]
359
+
360
+ return lem
361
+
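A few illustrative tokens for the postprocessing rules above, run through the public lemmatize() path (expected outputs are approximate; lexicon-backed results depend on the installed pack):

    from tokmor.morphology.unified import get_unified_analyzer

    ana = get_unified_analyzer("en")
    for tok in ["n't", "'re", "...", "2024-01-01", "U.S.", "US", "MMBTU", "them"]:
        print(tok, "->", ana.lemmatize(tok))
    # Roughly: n't -> not, 're -> be, "...", "2024-01-01", "U.S." and "US" kept as-is,
    # MMBTU -> mmbtu (vowel-less acronym), them -> they (pronoun override).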
362
+ def _ko_reconstruct_lemma(self, morphemes, word: str = "") -> Tuple[str, Optional[str]]:
363
+ """
364
+ Korean lemma reconstruction from morpheme sequence.
365
+ Goal: produce dictionary-form lemma closer to common usage (e.g., 먹었다 -> 먹다, 했다 -> 하다, 발표했다 -> 발표하다).
366
+ """
367
+ def _pos(m):
368
+ return getattr(m, "pos", "") or ""
369
+
370
+ def _lem(m):
371
+ return getattr(m, "lemma", getattr(m, "form", getattr(m, "surface", ""))) or ""
372
+
373
+ # 1) If there's an explicit verb/adjective stem, use it + '다'
374
+ for m in morphemes:
375
+ p = _pos(m)
376
+ if p in {"VV", "VA", "VX", "VCP", "VCN"}:
377
+ stem = _lem(m)
378
+ if stem:
379
+ return (stem + "다", p)
380
+
381
+ # 2) Noun + XSV('하') pattern -> noun + '하다'
382
+ if len(morphemes) >= 2:
383
+ m0, m1 = morphemes[0], morphemes[1]
384
+ if _pos(m0).startswith("N") and _pos(m1) == "XSV" and _lem(m1) == "하":
385
+ n = _lem(m0)
386
+ if n:
387
+ return (n + "하다", "VV")
388
+
389
+ # 3) Fallback: first morpheme lemma
390
+ m0 = morphemes[0]
391
+ return (_lem(m0) or word, _pos(m0) or None)
392
+
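A sketch of the noun + XSV('하') reconstruction above, calling the private helper directly for illustration only; the Morpheme keyword arguments mirror the ones used in get_morphemes() below, and the POS tags are illustrative:

    from tokmor.morphology.advanced_base import Morpheme
    from tokmor.morphology.unified import get_unified_analyzer

    ana = get_unified_analyzer("ko")
    ms = [Morpheme(form="발표", pos="NNG", lemma="발표", features=None),
          Morpheme(form="하", pos="XSV", lemma="하", features=None)]
    print(ana._ko_reconstruct_lemma(ms))  # ("발표하다", "VV")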
393
+ def _apply_suffix_rules(self, word: str, rules: Dict) -> Optional[Dict]:
394
+ """접미사 규칙 적용"""
395
+ best_match = None
396
+ best_length = 0
397
+
398
+ for suffix, rule in rules.items():
399
+ if word.endswith(suffix) and len(suffix) > best_length:
400
+ strip = rule.get('strip', len(suffix))
401
+ add = rule.get('add', '')
402
+
403
+ if len(word) > strip:
404
+ lemma = word[:-strip] + add if strip > 0 else word + add
405
+ if lemma: # guard against empty strings
406
+ best_match = {
407
+ 'lemma': lemma,
408
+ 'features': rule.get('features'),
409
+ 'prob': rule.get('prob', 0.5)
410
+ }
411
+ best_length = len(suffix)
412
+
413
+ return best_match
414
+
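The rules mapping expected here is keyed by suffix, each entry optionally carrying 'strip', 'add', 'features' and 'prob'; a small illustration with made-up rule values:

    from tokmor.morphology.unified import get_unified_analyzer

    rules = {
        "ies": {"strip": 3, "add": "y", "prob": 0.8},   # studies -> study
        "ing": {"strip": 3, "add": ""},
    }
    ana = get_unified_analyzer("en")
    print(ana._apply_suffix_rules("studies", rules))
    # -> {'lemma': 'study', 'features': None, 'prob': 0.8}  (longest matching suffix wins)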
415
+ def _fallback_lemmatize(self, word: str, *, original: Optional[str] = None) -> str:
416
+ """
417
+ Rule-based fallback lemmatization (pack-less / lexicon-less path)
418
+
419
+ Goals:
420
+ - Never crash, no matter which language or token comes in
421
+ - Do not distort meaning with aggressive stemming (stay conservative)
422
+ - Minimal normalization: NFC + strip edge punctuation + lowercase (where possible)
423
+ """
424
+ import unicodedata
425
+
426
+ w0 = original if original is not None else word
427
+ try:
428
+ w0 = unicodedata.normalize("NFC", w0)
429
+ except Exception:
430
+ pass
431
+ # Strip common edge punctuation (keep internal apostrophes/hyphens)
432
+ try:
433
+ w0 = w0.strip().strip(" \t\r\n")
434
+ # Include common CJK punctuation too (。!?、,:; etc.)
435
+ w0 = w0.strip(".,;:!?\"“”‘’`()[]{}<>«»。、,!?:;()【】『』「」《》〈〉")
436
+ except Exception:
437
+ pass
438
+ if not w0:
439
+ return word
440
+
441
+ def _has_casing(s: str) -> bool:
442
+ # True if the script has case distinctions (Latin/Cyrillic/Greek/etc.).
443
+ # (For uncased scripts, lower/casefold is a no-op anyway, but keep it explicit.)
444
+ try:
445
+ for ch in s:
446
+ if not ch.isalpha():
447
+ continue
448
+ if ch.lower() != ch.upper():
449
+ return True
450
+ except Exception:
451
+ pass
452
+ return False
453
+
454
+ # Default: casefold for any cased script (handles Latin/Cyrillic/Greek/etc).
455
+ try:
456
+ wl = w0.casefold() if _has_casing(w0) else w0
457
+ except Exception:
458
+ wl = w0.lower() if w0.isascii() else w0
459
+
460
+ # Arabic-family normalization (keeps it conservative; improves pack-less matching/stability)
461
+ if self.lang in {"ar", "fa", "ur"}:
462
+ try:
463
+ # Remove tatweel and common harakat/diacritics
464
+ wl = wl.replace("ـ", "")
465
+ wl = "".join(ch for ch in wl if unicodedata.category(ch) != "Mn")
466
+ # Normalize alef variants + ya/maqsura (useful for ar; harmless for fa/ur)
467
+ wl = wl.translate(
468
+ str.maketrans(
469
+ {
470
+ "أ": "ا",
471
+ "إ": "ا",
472
+ "آ": "ا",
473
+ "ى": "ي",
474
+ }
475
+ )
476
+ )
477
+ except Exception:
478
+ pass
479
+
480
+ # Hebrew: strip niqqud (Mn marks)
481
+ if self.lang == "he":
482
+ try:
483
+ wl = "".join(ch for ch in wl if unicodedata.category(ch) != "Mn")
484
+ except Exception:
485
+ pass
486
+
487
+ # Turkic orthography: split case-marking after apostrophe in proper nouns (Ankara'ya -> ankara)
488
+ if self.lang in {"tr", "az", "kk", "uz"}:
489
+ if "'" in wl:
490
+ try:
491
+ left, right = wl.split("'", 1)
492
+ if left and right and left.isalpha() and right.isalpha():
493
+ wl = left
494
+ except Exception:
495
+ pass
496
+
497
+ # Romance elision clitics: l'amour -> amour (very conservative)
498
+ if self.lang in {"fr", "it", "ca"}:
499
+ if "'" in wl:
500
+ try:
501
+ left, right = wl.split("'", 1)
502
+ if 1 <= len(left) <= 2 and right and right[0].isalpha():
503
+ wl = right
504
+ except Exception:
505
+ pass
506
+
507
+ def _is_hangul_syllable(ch: str) -> bool:
508
+ o = ord(ch)
509
+ return 0xAC00 <= o <= 0xD7A3
510
+
511
+ def _ko_strip_josa(s: str) -> str:
512
+ # Conservative: strip only common postpositions/endings if token is Hangul and longer than suffix.
513
+ if not s or len(s) < 2:
514
+ return s
515
+ if not all(_is_hangul_syllable(c) for c in s):
516
+ return s
517
+ # Longest-first
518
+ suffixes = [
519
+ "으로부터",
520
+ "에서부터",
521
+ "으로써",
522
+ "로써",
523
+ "으로서",
524
+ "로서",
525
+ "에게서",
526
+ "까지",
527
+ "부터",
528
+ "에서",
529
+ "에게",
530
+ "한테",
531
+ "께서",
532
+ "께",
533
+ "보다",
534
+ "처럼",
535
+ "마저",
536
+ "조차",
537
+ "라도",
538
+ "이나",
539
+ "나",
540
+ "와",
541
+ "과",
542
+ "으로",
543
+ "로",
544
+ "에",
545
+ "의",
546
+ "도",
547
+ "만",
548
+ "은",
549
+ "는",
550
+ "이",
551
+ "가",
552
+ "을",
553
+ "를",
554
+ ]
555
+ for suf in suffixes:
556
+ if len(s) > len(suf) and s.endswith(suf):
557
+ stem = s[: -len(suf)]
558
+ if stem:
559
+ return stem
560
+ return s
561
+
562
+ def _is_kanji(ch: str) -> bool:
563
+ o = ord(ch)
564
+ return (0x4E00 <= o <= 0x9FFF) or (0x3400 <= o <= 0x4DBF)
565
+
566
+ def _is_kana(ch: str) -> bool:
567
+ o = ord(ch)
568
+ return (0x3040 <= o <= 0x309F) or (0x30A0 <= o <= 0x30FF)
569
+
570
+ def _ja_strip_particles(s: str) -> str:
571
+ # Conservative: strip only if the token ends with a particle and the preceding char looks content-ish.
572
+ if not s or len(s) < 2:
573
+ return s
574
+ # multi-char first
575
+ multi = ["から", "まで", "より"]
576
+ for suf in multi:
577
+ if len(s) > len(suf) and s.endswith(suf):
578
+ stem = s[: -len(suf)]
579
+ if stem:
580
+ return stem
581
+ # single-char particles: only strip if previous char is Kanji/Katakana (avoid stripping from pure-hiragana words like もの)
582
+ single = ["は", "が", "を", "に", "で", "と", "も", "へ", "の", "や", "か", "ね", "よ", "な"]
583
+ last = s[-1]
584
+ if last in single:
585
+ prev = s[-2]
586
+ if _is_kanji(prev) or (0x30A0 <= ord(prev) <= 0x30FF):
587
+ stem = s[:-1]
588
+ if stem:
589
+ return stem
590
+ return s
591
+
592
+ # Basic English rules (unchanged from the previous implementation)
593
+ if self.lang == 'en':
594
+ # -ing
595
+ if wl.endswith('ing') and len(wl) > 4:
596
+ base = wl[:-3]
597
+ if base.endswith('e'):
598
+ return base
599
+ if len(base) > 2 and base[-1] == base[-2]: # running -> run
600
+ return base[:-1]
601
+ return base + 'e' if base[-1] not in 'aeiou' else base
602
+ # -ed
603
+ if wl.endswith('ed') and len(wl) > 3:
604
+ base = wl[:-2]
605
+ if wl.endswith('ied'):
606
+ return wl[:-3] + 'y'
607
+ if len(base) > 2 and base[-1] == base[-2]: # stopped -> stop
608
+ return base[:-1]
609
+ return base + 'e' if base[-1] not in 'aeiou' else base
610
+ # -s, -es
611
+ if wl.endswith('ies') and len(wl) > 3:
612
+ return wl[:-3] + 'y'
613
+ if wl.endswith('es') and len(wl) > 3:
614
+ return wl[:-2]
615
+ if wl.endswith('s') and len(wl) > 2:
616
+ return wl[:-1]
617
+ return wl
618
+
619
+ # High-value pack-less tuning: strip common function-word suffixes in agglutinative no-space-ish usage.
620
+ if self.lang == "ko":
621
+ return _ko_strip_josa(wl)
622
+ if self.lang == "ja":
623
+ return _ja_strip_particles(wl)
624
+
625
+ # Pack-less fallback for most languages: conservative normalization only
626
+ # - If ASCII: lowercase to stabilize (matches common lemma conventions for Latin-script langs)
627
+ # - Else: keep as-is (already NFC + edge punct stripped)
628
+ return wl
629
+
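A sketch of the conservative pack-less behaviour described above, calling the helper directly per language (illustration only; real callers go through lemmatize()):

    from tokmor.morphology.unified import UnifiedMorphAnalyzer

    print(UnifiedMorphAnalyzer("en")._fallback_lemmatize("studies"))    # -> study
    print(UnifiedMorphAnalyzer("tr")._fallback_lemmatize("Ankara'ya"))  # -> ankara
    print(UnifiedMorphAnalyzer("fr")._fallback_lemmatize("l'amour"))    # -> amour
    print(UnifiedMorphAnalyzer("ko")._fallback_lemmatize("학교에서"))    # -> 학교
    print(UnifiedMorphAnalyzer("ja")._fallback_lemmatize("東京から"))    # -> 東京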
630
+ def analyze(self, text: str) -> List[LemmaResult]:
631
+ """
632
+ Analyze a whole text
633
+
634
+ Args:
635
+ text: input text
636
+
637
+ Returns:
638
+ List of LemmaResult
639
+ """
640
+ # SEA no-space scripts: the template analyzers tend to group whole runs,
641
+ # while TokMor tokenizers (Brahmic/Thai native) provide much better segmentation.
642
+ # For product stability, prefer tokenizer-backed segmentation here.
643
+ if self.lang in {"th", "km", "lo", "my"}:
644
+ try:
645
+ from ..factory import get_tokenizer # lazy import
646
+ use_morph = (self.lang == "th")
647
+ tok = get_tokenizer(self.lang, use_morphology=use_morph)
648
+ tres = tok.tokenize(text)
649
+ out: List[LemmaResult] = []
650
+ for t in (tres.tokens or []):
651
+ try:
652
+ det = self.lemmatize_detailed(t.text)
653
+ lemma = t.lemma if getattr(t, "lemma", None) else det.lemma
654
+ out.append(
655
+ LemmaResult(
656
+ word=t.text,
657
+ lemma=lemma,
658
+ pos=getattr(t, "pos", None) or det.pos,
659
+ features=str(getattr(t, "features", None)) if getattr(t, "features", None) else det.features,
660
+ confidence=float(det.confidence),
661
+ source=det.source,
662
+ )
663
+ )
664
+ except Exception:
665
+ out.append(LemmaResult(word=t.text, lemma=t.text, source="fallback", confidence=0.1))
666
+ if out:
667
+ return out
668
+ except Exception:
669
+ # fall through to generic logic
670
+ pass
671
+
672
+ # Use the specialized analyzer if available
673
+ if self.specialized_analyzer:
674
+ try:
675
+ result = self.specialized_analyzer.analyze(text)
676
+ if result and result.morphemes:
677
+ return [
678
+ LemmaResult(
679
+ word=m.form,
680
+ lemma=getattr(m, 'lemma', m.form),
681
+ pos=m.pos,
682
+ features=getattr(m, 'features', None),
683
+ source='specialized'
684
+ )
685
+ for m in result.morphemes
686
+ ]
687
+ except Exception:
688
+ pass
689
+
690
+ # Generic fallback: tokenizer-backed segmentation (better than whitespace split for many scripts)
691
+ try:
692
+ from ..factory import get_tokenizer # lazy import
693
+ tok = get_tokenizer(self.lang, use_morphology=False)
694
+ tokens = tok.tokenize(text).texts()
695
+ if tokens:
696
+ return [self.lemmatize_detailed(w) for w in tokens]
697
+ except Exception:
698
+ pass
699
+
700
+ # Last fallback: whitespace split
701
+ words = text.split()
702
+ return [self.lemmatize_detailed(w) for w in words if w]
703
+
704
+ def get_morphemes(self, word: str) -> List[Morpheme]:
705
+ """
706
+ Return the morpheme analysis of a word (specialized analyzer only)
707
+
708
+ Args:
709
+ word: input word
710
+
711
+ Returns:
712
+ List of Morpheme
713
+ """
714
+ if self.specialized_analyzer:
715
+ try:
716
+ result = self.specialized_analyzer.analyze(word)
717
+ if result and result.morphemes:
718
+ return result.morphemes
719
+ except Exception:
720
+ pass
721
+
722
+ # Fallback: return as a single morpheme
723
+ lemma_result = self.lemmatize_detailed(word)
724
+ return [Morpheme(
725
+ form=word,
726
+ pos=lemma_result.pos or 'X',
727
+ lemma=lemma_result.lemma,
728
+ features=lemma_result.features
729
+ )]
730
+
731
+
732
+ # Cache (one analyzer per language code)
733
+ _unified_analyzers: Dict[str, UnifiedMorphAnalyzer] = {}
734
+
735
+
736
+ def get_unified_analyzer(lang: str) -> UnifiedMorphAnalyzer:
737
+ """
738
+ Return the unified morphological analyzer
739
+
740
+ Args:
741
+ lang: language code
742
+
743
+ Returns:
744
+ UnifiedMorphAnalyzer instance
745
+ """
746
+ lang = lang.lower()
747
+
748
+ if lang not in _unified_analyzers:
749
+ _unified_analyzers[lang] = UnifiedMorphAnalyzer(lang)
750
+
751
+ return _unified_analyzers[lang]
752
+
753
+
754
+ def unified_supported_languages() -> List[str]:
755
+ """
756
+ List of languages supported by unified morphological analysis
757
+
758
+ Returns:
759
+ List of language codes (union of tokenizer-supported, specialized, and lemma-lexicon languages)
760
+ """
761
+ # Languages with specialized analyzers
762
+ specialized = set(ANALYZERS.keys()) - {'xx'}
763
+ # Tokenize-supported languages (fallback lemma is always available)
764
+ tokenize_langs = set()
765
+ try:
766
+ from ..factory import supported_languages as _supported_languages # lazy import
767
+ tokenize_langs = set(_supported_languages())
768
+ except Exception:
769
+ tokenize_langs = set()
770
+ lex_langs = set()
771
+ try:
772
+ ld = resources.lemma_dict_dir()
773
+ if ld.exists():
774
+ for f in ld.glob("*"):
775
+ if f.suffix.lower() in (".pkl", ".sqlite", ".db", ".sqlite3"):
776
+ lex_langs.add(f.stem.split("_")[0])
777
+ except Exception:
778
+ pass
779
+ # Legacy external lemma packs are intentionally not supported in OSS distribution.
780
+
781
+ return sorted(tokenize_langs | specialized | lex_langs)
782
+
783
+
784
+ def unified_language_info() -> Dict[str, Any]:
785
+ """
786
+ Detailed per-language info for unified morphological analysis
787
+
788
+ Returns:
789
+ Dictionary of per-language info
790
+ """
791
+ # Languages with specialized analyzers
792
+ specialized = set(ANALYZERS.keys()) - {'xx'}
793
+
794
+ # Languages with a lemma lexicon
795
+ lex_langs = set()
796
+ try:
797
+ ld = resources.lemma_dict_dir()
798
+ if ld.exists():
799
+ for f in ld.glob("*"):
800
+ if f.suffix.lower() in (".pkl", ".sqlite", ".db", ".sqlite3"):
801
+ lex_langs.add(f.stem.split("_")[0])
802
+ except Exception:
803
+ pass
804
+ # Legacy external lemma packs are intentionally not supported in OSS distribution.
805
+
806
+ # Combine per-language info
807
+ all_langs = specialized | set(lex_langs)
808
+
809
+ info = {
810
+ 'total': len(all_langs),
811
+ 'specialized_count': len(specialized),
812
+ 'lexicon_count': len(lex_langs),
813
+ 'overlap_count': len(specialized & set(lex_langs)),
814
+ 'languages': {}
815
+ }
816
+
817
+ for lang in sorted(all_langs):
818
+ lang_info = {
819
+ 'has_specialized': lang in specialized,
820
+ 'has_lexicon': lang in lex_langs
821
+ }
822
+ info['languages'][lang] = lang_info
823
+
824
+ return info
825
+
826
+
827
+ # Convenience functions
828
+ def lemmatize(word: str, lang: str = 'en') -> str:
829
+ """
830
+ 단어의 기본형 반환 (편의 함수)
831
+
832
+ Args:
833
+ word: input word
834
+ lang: language code
835
+
836
+ Returns:
837
+ The base form (lemma)
838
+ """
839
+ analyzer = get_unified_analyzer(lang)
840
+ return analyzer.lemmatize(word)
841
+
842
+
843
+ def analyze(text: str, lang: str = 'en') -> List[LemmaResult]:
844
+ """
845
+ Morphological analysis of a text (convenience function)
846
+
847
+ Args:
848
+ text: input text
849
+ lang: language code
850
+
851
+ Returns:
852
+ List of LemmaResult
853
+ """
854
+ analyzer = get_unified_analyzer(lang)
855
+ return analyzer.analyze(text)
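A usage sketch for the convenience API in this module; exact lemmas and counts depend on which analyzers and lexicons are available at runtime:

    from tokmor.morphology import unified

    # Analyzers are cached per language code, so repeated lookups reuse the same instance.
    assert unified.get_unified_analyzer("de") is unified.get_unified_analyzer("de")

    print(unified.lemmatize("stopped", lang="en"))        # typically "stop"
    for r in unified.analyze("I was running late", lang="en"):
        print(r.word, r.lemma, r.source)

    # Which languages have a specialized analyzer and/or a local lemma lexicon:
    info = unified.unified_language_info()
    print(info["total"], info["specialized_count"], info["lexicon_count"])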