tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/unified.py
@@ -0,0 +1,855 @@
"""
Unified Morphological Analyzer
==============================

Unifies two morphology systems:
1. Specialized analyzers (74 languages) - rule-based
2. (Optional) lemma lexicon (per language) - dictionary-based (external assets)

Note:
- The TokMor OSS core does not ship large lemma dictionaries in the wheel/sdist.
- If needed, provide a lemma lexicon via `TOKMOR_DATA_DIR/lemma_dict/` or `TOKMOR_LEMMA_DICT_DIR`.
"""

import pickle
import os
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass

from .advanced_base import AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult
from . import ANALYZERS
from .. import resources
from ..lemma_store import BaseLemmaStore, load_lemma_store


@dataclass
class LemmaResult:
    """Lemmatization result"""
    word: str
    lemma: str
    pos: Optional[str] = None
    features: Optional[str] = None
    confidence: float = 1.0
    source: str = 'unknown'  # 'lexicon', 'specialized', 'model', 'rule', 'fallback'


class UnifiedMorphAnalyzer:
    """
    Unified morphological analyzer

    Priority:
    1. Specialized analyzer (if present)
    2. Lemma lexicon (if present)
    3. Rule-based fallback
    """

    def __init__(self, lang: str):
        self.lang = lang.lower()
        self.specialized_analyzer = None
        self._lemma_store: Optional[BaseLemmaStore] = None  # lazy-loaded (sqlite/pkl)
        self._load_analyzers()

    def _load_analyzers(self):
        """Load analyzers"""
        # 1. Try the specialized analyzer
        if self.lang in ANALYZERS:
            try:
                self.specialized_analyzer = ANALYZERS[self.lang]()
            except Exception:
                pass

    def _get_lemma_store(self) -> Optional[BaseLemmaStore]:
        """
        Load lemma lexicon store (surface -> lemma) if available.

        Location (priority):
        - TOKMOR_LEMMA_DICT_DIR/{lang}.sqlite|.db|.pkl ...
        - tokmor/models/lemma_dict/{lang}.sqlite|.db|.pkl ...
        - (legacy) some deployments used additional lemma packs under TOKMOR_DATA_DIR
        """
        if self._lemma_store is not None:
            return self._lemma_store

        # Allow lemma aliasing for top100 wiki-style codes (dev convenience)
        lex_lang = resources.normalize_lang_for_lemma(self.lang) if hasattr(resources, "normalize_lang_for_lemma") else self.lang
        lex_path = resources.resolve_lemma_dict_path(lex_lang)
        if not lex_path:
            self._lemma_store = None
            return None

        try:
            self._lemma_store = load_lemma_store(lex_path)
            return self._lemma_store
        except Exception:
            pass

        self._lemma_store = None
        return None

    @property
    def source(self) -> str:
        """Return which analyzer backs this instance"""
        if self.specialized_analyzer:
            return 'specialized'
        return 'fallback'

    @property
    def is_available(self) -> bool:
        """Whether analysis is available"""
        # Quality spec for TokMor preprocessing:
        # - segmentation is handled by tokenizer
        # - lemma is always available at least as an identity fallback
        # Therefore morphology should be "available" for any tokenized language.
        return True

    def lemmatize(self, word: str, pos_hint: Optional[str] = None) -> str:
        """
        Return the base form (lemma) of a word

        Args:
            word: input word

        Returns:
            Base form (lemma)
        """
        result = self.lemmatize_detailed(word, pos_hint=pos_hint)
        return result.lemma

    def lemmatize_detailed(self, word: str, *, pos_hint: Optional[str] = None) -> LemmaResult:
        """
        Return a detailed lemmatization result

        Args:
            word: input word

        Returns:
            LemmaResult with lemma, pos, features, source
        """
        word_lower = word.lower()

        # 0. Lemma lexicon (if present): highest priority
        store = self._get_lemma_store()
        if store:
            # Latin-script languages: store lowercase key; otherwise keep exact surface too.
            key = word_lower if self.lang in {"en", "de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi"} else word
            lemma = store.get(key)
            if lemma:
                # Some legacy lexicons store decomposed lemmas like "먹+었+다".
                # For Korean, prefer the specialized analyzer unless the lexicon provides a clean lemma.
                if self.lang == "ko" and isinstance(lemma, str) and "+" in lemma:
                    style = os.getenv("TOKMOR_KO_LEMMA_STYLE", "dict").strip().lower()
                    if style not in ("decomp",):
                        lemma = None
                if lemma:
                    if self.lang == "en":
                        lemma = self._en_postprocess_lemma(word, lemma, source="lexicon", pos_hint=pos_hint)
                    return LemmaResult(
                        word=word,
                        lemma=lemma,
                        confidence=0.99,
                        source="lexicon",
                    )

        # Pack-less mode: prefer conservative fallback rules across languages.
        # Many "advanced analyzers" are dictionary/heuristic segmenters and can output partial lemmas
        # (worse than a stable normalized surface) when no lemma pack is present.
        packless = os.getenv("TOKMOR_DISABLE_LEMMA_PACK", "").strip().lower() in {"1", "true", "yes", "y", "on"}

        # 1. Try the specialized analyzer
        # NOTE: for Chinese, lemma is effectively identity; specialized segmentation can produce
        # undesirable "partial" lemmas for combined tokens (e.g., dates). Prefer fallback.
        if self.lang == "zh" or (packless and self.lang not in {"ko"}):
            self.specialized_analyzer = None

        if self.specialized_analyzer:
            try:
                result = self.specialized_analyzer.analyze(word)

                # Advanced analyzers often return NBestResult -> use .best.morphemes
                morphemes = None
                confidence = 1.0
                if hasattr(result, "best") and hasattr(result.best, "morphemes"):
                    morphemes = result.best.morphemes
                    confidence = getattr(result.best, "score", 1.0)
                elif hasattr(result, "morphemes"):
                    morphemes = result.morphemes
                    confidence = getattr(result, "score", 1.0)

                if morphemes:
                    # Korean: reconstruct lemma for verb/adjective and noun+XSV(하다) patterns
                    if self.lang == "ko":
                        lemma, pos = self._ko_reconstruct_lemma(morphemes, word)
                        return LemmaResult(
                            word=word,
                            lemma=lemma,
                            pos=pos,
                            features=getattr(morphemes[0], "features", None),
                            confidence=float(confidence),
                            source="specialized",
                        )

                    # Japanese: avoid returning a function morpheme as "lemma" (e.g., もの -> も).
                    if self.lang == "ja":
                        # Prefer a "content" morpheme lemma; fallback to normalized surface.
                        def _is_ja_function_pos(p: Optional[str]) -> bool:
                            if not p:
                                return False
                            # Common function tags in our JA analyzer family
                            if p.startswith("助"):
                                return True  # 助詞/助動詞...
                            if p in {"連用形", "終止形", "未然形", "促音便"}:
                                return True
                            return False

                        for m in morphemes:
                            mp = getattr(m, "pos", None)
                            if _is_ja_function_pos(mp):
                                continue
                            lem0 = getattr(m, "lemma", getattr(m, "form", None))
                            if isinstance(lem0, str) and lem0:
                                return LemmaResult(
                                    word=word,
                                    lemma=lem0,
                                    pos=mp,
                                    features=getattr(m, "features", None),
                                    confidence=float(confidence),
                                    source="specialized",
                                )
                        # If everything looked like function parts, fall back to rule-based path.
                        raise RuntimeError("ja specialized produced only function morphemes")

                    # Default: first morpheme lemma
                    m0 = morphemes[0]
                    lem = getattr(m0, "lemma", getattr(m0, "form", word))
                    if self.lang == "en":
                        lem = self._en_postprocess_lemma(word, str(lem), source="specialized", pos_hint=pos_hint)
                    return LemmaResult(
                        word=word,
                        lemma=lem,
                        pos=getattr(m0, "pos", None),
                        features=getattr(m0, "features", None),
                        confidence=float(confidence),
                        source="specialized",
                    )
            except Exception:
                pass

        # 2. Rule-based fallback
        lemma = self._fallback_lemmatize(word_lower, original=word)
        if self.lang == "en":
            lemma = self._en_postprocess_lemma(word, lemma, source="fallback", pos_hint=pos_hint)
        return LemmaResult(
            word=word,
            lemma=lemma,
            confidence=0.3,
            source='fallback'
        )

    def _en_postprocess_lemma(self, word: str, lemma: str, *, source: str = "", pos_hint: Optional[str] = None) -> str:
        """
        English lemma postprocessing to better match common corpus conventions on noisy/web corpora.
        This runs AFTER lexicon lookup and AFTER rule-based fallback.
        """
        import unicodedata

        w = word or ""
        wl = w.lower()
        lem = lemma or w

        # 1) Common clitics / contractions (token-level, no context)
        # Note: "'s" is ambiguous (POS=PART vs AUX). In EWT test, "'s" is slightly more common.
        CONTR = {
            "n't": "not",
            "'re": "be",
            "'m": "be",
            "'ve": "have",
            "'ll": "will",
            "'d": "would",
            # "'s" is ambiguous: AUX -> be/have, PART -> 's
            "'s": "'s",
        }
        if wl in CONTR:
            if wl == "'s":
                # Use coarse POS hint when available: AUX/VERB-like -> be
                ph = (pos_hint or "").upper()
                if ph in {"V"}:
                    return "be"
            return CONTR[wl]
        # Token "s" (missing apostrophe in noisy text) is ambiguous; in EWT it most often maps to BE.
        if wl == "s" and w == "s":
            ph = (pos_hint or "").upper()
            if ph in {"V"}:
                return "be"
            return "'s"

        # 2) Preserve punctuation-only tokens exactly (..., --, etc.)
        def _is_punct_or_symbol_only(s: str) -> bool:
            return bool(s) and all(unicodedata.category(ch)[0] in {"P", "S"} for ch in s)

        if _is_punct_or_symbol_only(w):
            return w

        # 3) Preserve special mixed tokens (dates, ids, emails, domains, filenames, phone numbers)
        has_digit = any(ch.isdigit() for ch in w)
        has_alpha = any(ch.isalpha() for ch in w)
        has_special_sep = any(ch in {"@", ".", "-", "_", "/", ":"} for ch in w)
        if (has_digit and (has_alpha or has_special_sep)) or (("@" in w) and has_alpha):
            return w
        # phone-like patterns (digits with separators)
        if has_digit and any(ch in {"-", "/"} for ch in w):
            return w

        # 4) Abbreviations (web/news specific wins)
        if w == "AM":
            return "a.m."
        if w == "PM":
            return "p.m."
        # Abbreviations ending with a dot: keep surface (Inc., Dr., U.S., D.C., ...)
        if "." in w and w.endswith(".") and any(ch.isalpha() for ch in w):
            return w

        # 5) Acronyms / all-caps
        if len(w) >= 2 and w.isupper() and w.isalpha():
            # Short 2-letter country/region acronyms are usually kept as-is (US, UK, EU, ...).
            if len(w) <= 2:
                return w
            # If lexicon produced a lemma, trust it (common words often appear in all-caps in headlines).
            if source == "lexicon":
                return lem
            # Heuristic: vowel-less all-caps tokens are often technical acronyms -> lowercase (MMBTU -> mmbtu)
            vcnt = sum(1 for ch in w if ch in "AEIOU")
            if vcnt <= 1:
                return wl
            # Otherwise, likely a name written in all-caps -> Titlecase (SOUTER -> Souter)
            return w.title()

        # 6) Emoticons with letters: lowercase (e.g., :D -> :d)
        if any(ch.isalpha() for ch in w) and sum(ch.isalnum() for ch in w) <= 2 and has_special_sep:
            return w.lower()

        # 7) Case normalization fixes
        # If lexicon returns a titlecased lemma for a lowercase token, prefer the lowercase surface.
        if w.islower() and len(lem) > 1 and lem[0].isupper() and lem.lower() == wl:
            return w

        # If token is Titlecase and lemma is the fully-lowercased form, it can be a PROPN-like lemma.
        # Apply only for non-lexicon sources to avoid harming normal sentence-initial capitalization.
        if source != "lexicon" and len(w) >= 2 and w[0].isupper() and w[1:].islower() and lem == wl:
            TITLE_STOP = {
                "i", "a", "an", "the", "this", "that", "these", "those",
                "and", "or", "but", "if", "in", "on", "at", "to", "of", "for", "from", "by", "with", "as",
                "do", "did", "does", "is", "are", "was", "were", "be",
                "my", "your", "our", "their", "his", "her",
                "yes", "no", "hi", "hello", "thanks", "thank", "please", "what", "how", "why", "when", "where", "who",
                "good", "great", "very", "email",
            }
            if wl not in TITLE_STOP:
                return w

        # 8) Pronoun lemma overrides (unambiguous subset)
        PRON = {
            "us": "we",
            "me": "I",
            "him": "he",
            "them": "they",
        }
        if wl in PRON and w.islower():
            return PRON[wl]

        return lem

    def _ko_reconstruct_lemma(self, morphemes, word: str = "") -> Tuple[str, Optional[str]]:
        """
        Korean lemma reconstruction from morpheme sequence.
        Goal: produce dictionary-form lemma closer to common usage (e.g., 먹었다 -> 먹다, 했다 -> 하다, 발표했다 -> 발표하다).
        """
        def _pos(m):
            return getattr(m, "pos", "") or ""

        def _lem(m):
            return getattr(m, "lemma", getattr(m, "form", getattr(m, "surface", ""))) or ""

        # 1) If there's an explicit verb/adjective stem, use it + '다'
        for m in morphemes:
            p = _pos(m)
            if p in {"VV", "VA", "VX", "VCP", "VCN"}:
                stem = _lem(m)
                if stem:
                    return (stem + "다", p)

        # 2) Noun + XSV('하') pattern -> noun + '하다'
        if len(morphemes) >= 2:
            m0, m1 = morphemes[0], morphemes[1]
            if _pos(m0).startswith("N") and _pos(m1) == "XSV" and _lem(m1) == "하":
                n = _lem(m0)
                if n:
                    return (n + "하다", "VV")

        # 3) Fallback: first morpheme lemma
        m0 = morphemes[0]
        return (_lem(m0) or word, _pos(m0) or None)

    def _apply_suffix_rules(self, word: str, rules: Dict) -> Optional[Dict]:
        """Apply suffix rules"""
        best_match = None
        best_length = 0

        for suffix, rule in rules.items():
            if word.endswith(suffix) and len(suffix) > best_length:
                strip = rule.get('strip', len(suffix))
                add = rule.get('add', '')

                if len(word) > strip:
                    lemma = word[:-strip] + add if strip > 0 else word + add
                    if lemma:  # avoid producing an empty string
                        best_match = {
                            'lemma': lemma,
                            'features': rule.get('features'),
                            'prob': rule.get('prob', 0.5)
                        }
                        best_length = len(suffix)

        return best_match

    def _fallback_lemmatize(self, word: str, *, original: Optional[str] = None) -> str:
        """
        Rule-based fallback lemmatization (pack-less / lexicon-less path)

        Goals:
        - Never crash, whatever the language/token
        - Do not damage meaning with aggressive stemming (stay conservative)
        - Minimal normalization: NFC + strip edge punctuation + lowercase (when possible)
        """
        import unicodedata

        w0 = original if original is not None else word
        try:
            w0 = unicodedata.normalize("NFC", w0)
        except Exception:
            pass
        # Strip common edge punctuation (keep internal apostrophes/hyphens)
        try:
            w0 = w0.strip().strip(" \t\r\n")
            # Include common CJK punctuation too (。!?、,:; etc.)
            w0 = w0.strip(".,;:!?\"“”‘’`()[]{}<>«»。、,!?:;()【】『』「」《》〈〉")
        except Exception:
            pass
        if not w0:
            return word

        def _has_casing(s: str) -> bool:
            # True if the script has case distinctions (Latin/Cyrillic/Greek/etc.).
            # (For uncased scripts, lower/casefold is a no-op anyway, but keep it explicit.)
            try:
                for ch in s:
                    if not ch.isalpha():
                        continue
                    if ch.lower() != ch.upper():
                        return True
            except Exception:
                pass
            return False

        # Default: casefold for any cased script (handles Latin/Cyrillic/Greek/etc).
        try:
            wl = w0.casefold() if _has_casing(w0) else w0
        except Exception:
            wl = w0.lower() if w0.isascii() else w0

        # Arabic-family normalization (keeps it conservative; improves pack-less matching/stability)
        if self.lang in {"ar", "fa", "ur"}:
            try:
                # Remove tatweel and common harakat/diacritics
                wl = wl.replace("ـ", "")
                wl = "".join(ch for ch in wl if unicodedata.category(ch) != "Mn")
                # Normalize alef variants + ya/maqsura (useful for ar; harmless for fa/ur)
                wl = wl.translate(
                    str.maketrans(
                        {
                            "أ": "ا",
                            "إ": "ا",
                            "آ": "ا",
                            "ى": "ي",
                        }
                    )
                )
            except Exception:
                pass

        # Hebrew: strip niqqud (Mn marks)
        if self.lang == "he":
            try:
                wl = "".join(ch for ch in wl if unicodedata.category(ch) != "Mn")
            except Exception:
                pass

        # Turkic orthography: split case-marking after apostrophe in proper nouns (Ankara'ya -> ankara)
        if self.lang in {"tr", "az", "kk", "uz"}:
            if "'" in wl:
                try:
                    left, right = wl.split("'", 1)
                    if left and right and left.isalpha() and right.isalpha():
                        wl = left
                except Exception:
                    pass

        # Romance elision clitics: l'amour -> amour (very conservative)
        if self.lang in {"fr", "it", "ca"}:
            if "'" in wl:
                try:
                    left, right = wl.split("'", 1)
                    if 1 <= len(left) <= 2 and right and right[0].isalpha():
                        wl = right
                except Exception:
                    pass

        def _is_hangul_syllable(ch: str) -> bool:
            o = ord(ch)
            return 0xAC00 <= o <= 0xD7A3

        def _ko_strip_josa(s: str) -> str:
            # Conservative: strip only common postpositions/endings if token is Hangul and longer than suffix.
            if not s or len(s) < 2:
                return s
            if not all(_is_hangul_syllable(c) for c in s):
                return s
            # Longest-first
            suffixes = [
                "으로부터",
                "에서부터",
                "으로써",
                "로써",
                "으로서",
                "로서",
                "에게서",
                "까지",
                "부터",
                "에서",
                "에게",
                "한테",
                "께서",
                "께",
                "보다",
                "처럼",
                "마저",
                "조차",
                "라도",
                "이나",
                "나",
                "와",
                "과",
                "으로",
                "로",
                "에",
                "의",
                "도",
                "만",
                "은",
                "는",
                "이",
                "가",
                "을",
                "를",
            ]
            for suf in suffixes:
                if len(s) > len(suf) and s.endswith(suf):
                    stem = s[: -len(suf)]
                    if stem:
                        return stem
            return s

        def _is_kanji(ch: str) -> bool:
            o = ord(ch)
            return (0x4E00 <= o <= 0x9FFF) or (0x3400 <= o <= 0x4DBF)

        def _is_kana(ch: str) -> bool:
            o = ord(ch)
            return (0x3040 <= o <= 0x309F) or (0x30A0 <= o <= 0x30FF)

        def _ja_strip_particles(s: str) -> str:
            # Conservative: strip only if the token ends with a particle and the preceding char looks content-ish.
            if not s or len(s) < 2:
                return s
            # multi-char first
            multi = ["から", "まで", "より"]
            for suf in multi:
                if len(s) > len(suf) and s.endswith(suf):
                    stem = s[: -len(suf)]
                    if stem:
                        return stem
            # single-char particles: only strip if previous char is Kanji/Katakana (avoid stripping from pure-hiragana words like もの)
            single = ["は", "が", "を", "に", "で", "と", "も", "へ", "の", "や", "か", "ね", "よ", "な"]
            last = s[-1]
            if last in single:
                prev = s[-2]
                if _is_kanji(prev) or (0x30A0 <= ord(prev) <= 0x30FF):
                    stem = s[:-1]
                    if stem:
                        return stem
            return s

        # Basic English rules (kept from the previous implementation)
        if self.lang == 'en':
            # -ing
            if wl.endswith('ing') and len(wl) > 4:
                base = wl[:-3]
                if base.endswith('e'):
                    return base
                if len(base) > 2 and base[-1] == base[-2]:  # running -> run
                    return base[:-1]
                return base + 'e' if base[-1] not in 'aeiou' else base
            # -ed
            if wl.endswith('ed') and len(wl) > 3:
                base = wl[:-2]
                if wl.endswith('ied'):
                    return wl[:-3] + 'y'
                if len(base) > 2 and base[-1] == base[-2]:  # stopped -> stop
                    return base[:-1]
                return base + 'e' if base[-1] not in 'aeiou' else base
            # -s, -es
            if wl.endswith('ies') and len(wl) > 3:
                return wl[:-3] + 'y'
            if wl.endswith('es') and len(wl) > 3:
                return wl[:-2]
            if wl.endswith('s') and len(wl) > 2:
                return wl[:-1]
            return wl

        # High-value pack-less tuning: strip common function-word suffixes in agglutinative no-space-ish usage.
        if self.lang == "ko":
            return _ko_strip_josa(wl)
        if self.lang == "ja":
            return _ja_strip_particles(wl)

        # Pack-less fallback for most languages: conservative normalization only
        # - If ASCII: lowercase to stabilize (matches common lemma conventions for Latin-script langs)
        # - Else: keep as-is (already NFC + edge punct stripped)
        return wl

    def analyze(self, text: str) -> List[LemmaResult]:
        """
        Analyze a whole text

        Args:
            text: input text

        Returns:
            List of LemmaResult
        """
        # SEA no-space scripts: the template analyzers tend to group whole runs,
        # while TokMor tokenizers (Brahmic/Thai native) provide much better segmentation.
        # For product stability, prefer tokenizer-backed segmentation here.
        if self.lang in {"th", "km", "lo", "my"}:
            try:
                from ..factory import get_tokenizer  # lazy import
                use_morph = (self.lang == "th")
                tok = get_tokenizer(self.lang, use_morphology=use_morph)
                tres = tok.tokenize(text)
                out: List[LemmaResult] = []
                for t in (tres.tokens or []):
                    try:
                        det = self.lemmatize_detailed(t.text)
                        lemma = t.lemma if getattr(t, "lemma", None) else det.lemma
                        out.append(
                            LemmaResult(
                                word=t.text,
                                lemma=lemma,
                                pos=getattr(t, "pos", None) or det.pos,
                                features=str(getattr(t, "features", None)) if getattr(t, "features", None) else det.features,
                                confidence=float(det.confidence),
                                source=det.source,
                            )
                        )
                    except Exception:
                        out.append(LemmaResult(word=t.text, lemma=t.text, source="fallback", confidence=0.1))
                if out:
                    return out
            except Exception:
                # fall through to generic logic
                pass

        # Use the specialized analyzer if present
        if self.specialized_analyzer:
            try:
                result = self.specialized_analyzer.analyze(text)
                if result and result.morphemes:
                    return [
                        LemmaResult(
                            word=m.form,
                            lemma=getattr(m, 'lemma', m.form),
                            pos=m.pos,
                            features=getattr(m, 'features', None),
                            source='specialized'
                        )
                        for m in result.morphemes
                    ]
            except Exception:
                pass

        # Generic fallback: tokenizer-backed segmentation (better than whitespace split for many scripts)
        try:
            from ..factory import get_tokenizer  # lazy import
            tok = get_tokenizer(self.lang, use_morphology=False)
            tokens = tok.tokenize(text).texts()
            if tokens:
                return [self.lemmatize_detailed(w) for w in tokens]
        except Exception:
            pass

        # Last fallback: whitespace split
        words = text.split()
        return [self.lemmatize_detailed(w) for w in words if w]

    def get_morphemes(self, word: str) -> List[Morpheme]:
        """
        Return the morpheme analysis of a word (specialized analyzer only)

        Args:
            word: input word

        Returns:
            List of Morpheme
        """
        if self.specialized_analyzer:
            try:
                result = self.specialized_analyzer.analyze(word)
                if result and result.morphemes:
                    return result.morphemes
            except Exception:
                pass

        # Fallback: return the word as a single morpheme
        lemma_result = self.lemmatize_detailed(word)
        return [Morpheme(
            form=word,
            pos=lemma_result.pos or 'X',
            lemma=lemma_result.lemma,
            features=lemma_result.features
        )]


# Cache
_unified_analyzers: Dict[str, UnifiedMorphAnalyzer] = {}


def get_unified_analyzer(lang: str) -> UnifiedMorphAnalyzer:
    """
    Return a unified morphological analyzer

    Args:
        lang: language code

    Returns:
        UnifiedMorphAnalyzer instance
    """
    lang = lang.lower()

    if lang not in _unified_analyzers:
        _unified_analyzers[lang] = UnifiedMorphAnalyzer(lang)

    return _unified_analyzers[lang]


def unified_supported_languages() -> List[str]:
    """
    List the languages supported by unified morphological analysis

    Returns:
        List of language codes (union of specialized + lemma lexicon)
    """
    # Languages with a specialized analyzer
    specialized = set(ANALYZERS.keys()) - {'xx'}
    # Tokenize-supported languages (fallback lemma is always available)
    tokenize_langs = set()
    try:
        from ..factory import supported_languages as _supported_languages  # lazy import
        tokenize_langs = set(_supported_languages())
    except Exception:
        tokenize_langs = set()
    lex_langs = set()
    try:
        ld = resources.lemma_dict_dir()
        if ld.exists():
            for f in ld.glob("*"):
                if f.suffix.lower() in (".pkl", ".sqlite", ".db", ".sqlite3"):
                    lex_langs.add(f.stem.split("_")[0])
    except Exception:
        pass
    # Legacy external lemma packs are intentionally not supported in OSS distribution.

    return sorted(tokenize_langs | specialized | lex_langs)


def unified_language_info() -> Dict[str, Any]:
    """
    Per-language details for unified morphological analysis

    Returns:
        Dictionary of per-language information
    """
    # Languages with a specialized analyzer
    specialized = set(ANALYZERS.keys()) - {'xx'}

    # Languages with a lemma lexicon
    lex_langs = set()
    try:
        ld = resources.lemma_dict_dir()
        if ld.exists():
            for f in ld.glob("*"):
                if f.suffix.lower() in (".pkl", ".sqlite", ".db", ".sqlite3"):
                    lex_langs.add(f.stem.split("_")[0])
    except Exception:
        pass
    # Legacy external lemma packs are intentionally not supported in OSS distribution.

    # Combine per-language info
    all_langs = specialized | set(lex_langs)

    info = {
        'total': len(all_langs),
        'specialized_count': len(specialized),
        'lexicon_count': len(lex_langs),
        'overlap_count': len(specialized & set(lex_langs)),
        'languages': {}
    }

    for lang in sorted(all_langs):
        lang_info = {
            'has_specialized': lang in specialized,
            'has_lexicon': lang in lex_langs
        }
        info['languages'][lang] = lang_info

    return info


# Convenience functions
def lemmatize(word: str, lang: str = 'en') -> str:
    """
    Return the base form of a word (convenience function)

    Args:
        word: input word
        lang: language code

    Returns:
        Base form (lemma)
    """
    analyzer = get_unified_analyzer(lang)
    return analyzer.lemmatize(word)


def analyze(text: str, lang: str = 'en') -> List[LemmaResult]:
    """
    Morphological analysis of a text (convenience function)

    Args:
        text: input text
        lang: language code

    Returns:
        List of LemmaResult
    """
    analyzer = get_unified_analyzer(lang)
    return analyzer.analyze(text)
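For orientation, here is a minimal usage sketch of the public helpers this module adds (get_unified_analyzer, lemmatize, analyze). The example inputs and the lemmas mentioned in the comments are illustrative assumptions, not test output from the package; actual results depend on which specialized analyzers and optional lemma lexicons are available at runtime.

    from tokmor.morphology.unified import get_unified_analyzer, lemmatize, analyze

    # Analyzer instances are cached per language code by get_unified_analyzer().
    en = get_unified_analyzer("en")
    print(en.lemmatize("running"))       # typically "run", via lexicon, specialized, or rule-based fallback
    print(en.lemmatize_detailed("'re"))  # LemmaResult whose lemma should be "be" after English postprocessing

    # Module-level convenience wrappers delegate to the cached analyzer.
    print(lemmatize("stopped", lang="en"))  # typically "stop"
    for r in analyze("The cats were running .", lang="en"):
        print(r.word, r.lemma, r.source)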