tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""
|
|
2
|
+
English Morphological Analyzer - 자체 구현
|
|
3
|
+
==========================================
|
|
4
|
+
|
|
5
|
+
특징:
|
|
6
|
+
- 어간 추출 (Stemming)
|
|
7
|
+
- 불규칙 동사/명사 처리
|
|
8
|
+
- 접사 분리 (prefix/suffix)
|
|
9
|
+
- 품사 태깅 (POS tagging)
|
|
10
|
+
|
|
11
|
+
품사 태그 (Penn Treebank):
|
|
12
|
+
- NN: 명사, NNS: 복수명사, NNP: 고유명사
|
|
13
|
+
- VB: 동사원형, VBD: 과거, VBG: 현재분사, VBN: 과거분사, VBZ: 3인칭단수
|
|
14
|
+
- JJ: 형용사, JJR: 비교급, JJS: 최상급
|
|
15
|
+
- RB: 부사, RBR: 비교급, RBS: 최상급
|
|
16
|
+
- DT: 관사, IN: 전치사, CC: 접속사
|
|
17
|
+
- PRP: 인칭대명사, PRP$: 소유대명사
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import re
|
|
21
|
+
from typing import List, Tuple, Dict, Optional
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Morpheme:
    """A single analyzed token: surface form, lemma, POS tag and offsets."""

    surface: str  # token text exactly as it appeared in the input
    lemma: str    # base (dictionary) form
    pos: str      # Penn Treebank part-of-speech tag
    start: int    # character offset of the token start (inclusive)
    end: int      # character offset of the token end (exclusive)

    def __repr__(self):
        # Compact "surface/POS" rendering, e.g. "cats/NNS".
        return "{}/{}".format(self.surface, self.pos)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EnglishAnalyzer:
    """Rule-based English morphological analyzer (Penn Treebank tags).

    Per-token pipeline (first match wins):
      1. closed-class (function word) lookup
      2. irregular verb / irregular plural lookup
      3. regex affix stripping (verb inflection, plural, comparative)
      4. derivational suffix -> POS guess
      5. capitalization heuristic (NNP for non-sentence-initial capitals)
      6. fallback: NN

    Note: within a pattern group, rules are ordered most-specific-first;
    a previous revision had duplicate suffix patterns where the later,
    more general rule was unreachable ("walking" -> "walke").
    """

    def __init__(self):
        self._build_dictionary()
        self._build_rules()

    def _build_dictionary(self):
        """Build lookup tables: irregular forms, closed classes, suffix->POS."""

        # Irregular verbs (inflected form -> lemma)
        self.irregular_verbs = {
            # be
            'am': 'be', 'is': 'be', 'are': 'be', 'was': 'be', 'were': 'be', 'been': 'be', 'being': 'be',
            # have
            'has': 'have', 'had': 'have', 'having': 'have',
            # do
            'does': 'do', 'did': 'do', 'done': 'do', 'doing': 'do',
            # go
            'goes': 'go', 'went': 'go', 'gone': 'go', 'going': 'go',
            # come
            'came': 'come', 'coming': 'come',
            # get
            'gets': 'get', 'got': 'get', 'gotten': 'get', 'getting': 'get',
            # make
            'makes': 'make', 'made': 'make', 'making': 'make',
            # say
            'says': 'say', 'said': 'say', 'saying': 'say',
            # take
            'takes': 'take', 'took': 'take', 'taken': 'take', 'taking': 'take',
            # see
            'sees': 'see', 'saw': 'see', 'seen': 'see', 'seeing': 'see',
            # know
            'knows': 'know', 'knew': 'know', 'known': 'know', 'knowing': 'know',
            # think
            'thinks': 'think', 'thought': 'think', 'thinking': 'think',
            # give
            'gives': 'give', 'gave': 'give', 'given': 'give', 'giving': 'give',
            # find
            'finds': 'find', 'found': 'find', 'finding': 'find',
            # tell
            'tells': 'tell', 'told': 'tell', 'telling': 'tell',
            # become
            'becomes': 'become', 'became': 'become', 'becoming': 'become',
            # leave
            'leaves': 'leave', 'left': 'leave', 'leaving': 'leave',
            # put
            'puts': 'put', 'putting': 'put',
            # keep
            'keeps': 'keep', 'kept': 'keep', 'keeping': 'keep',
            # let
            'lets': 'let', 'letting': 'let',
            # begin
            'begins': 'begin', 'began': 'begin', 'begun': 'begin', 'beginning': 'begin',
            # write
            'writes': 'write', 'wrote': 'write', 'written': 'write', 'writing': 'write',
            # run
            'runs': 'run', 'ran': 'run', 'running': 'run',
            # read
            'reads': 'read', 'reading': 'read',
            # speak
            'speaks': 'speak', 'spoke': 'speak', 'spoken': 'speak', 'speaking': 'speak',
            # buy
            'buys': 'buy', 'bought': 'buy', 'buying': 'buy',
            # bring
            'brings': 'bring', 'brought': 'bring', 'bringing': 'bring',
            # sit
            'sits': 'sit', 'sat': 'sit', 'sitting': 'sit',
            # stand
            'stands': 'stand', 'stood': 'stand', 'standing': 'stand',
            # lose
            'loses': 'lose', 'lost': 'lose', 'losing': 'lose',
            # pay
            'pays': 'pay', 'paid': 'pay', 'paying': 'pay',
            # meet
            'meets': 'meet', 'met': 'meet', 'meeting': 'meet',
            # send
            'sends': 'send', 'sent': 'send', 'sending': 'send',
            # build
            'builds': 'build', 'built': 'build', 'building': 'build',
            # fall
            'falls': 'fall', 'fell': 'fall', 'fallen': 'fall', 'falling': 'fall',
            # cut
            'cuts': 'cut', 'cutting': 'cut',
            # drive
            'drives': 'drive', 'drove': 'drive', 'driven': 'drive', 'driving': 'drive',
            # break
            'breaks': 'break', 'broke': 'break', 'broken': 'break', 'breaking': 'break',
            # grow
            'grows': 'grow', 'grew': 'grow', 'grown': 'grow', 'growing': 'grow',
            # choose
            'chooses': 'choose', 'chose': 'choose', 'chosen': 'choose', 'choosing': 'choose',
            # eat
            'eats': 'eat', 'ate': 'eat', 'eaten': 'eat', 'eating': 'eat',
            # draw
            'draws': 'draw', 'drew': 'draw', 'drawn': 'draw', 'drawing': 'draw',
            # fly
            'flies': 'fly', 'flew': 'fly', 'flown': 'fly', 'flying': 'fly',
            # throw
            'throws': 'throw', 'threw': 'throw', 'thrown': 'throw', 'throwing': 'throw',
            # catch
            'catches': 'catch', 'caught': 'catch', 'catching': 'catch',
            # teach
            'teaches': 'teach', 'taught': 'teach', 'teaching': 'teach',
            # wear
            'wears': 'wear', 'wore': 'wear', 'worn': 'wear', 'wearing': 'wear',
            # win
            'wins': 'win', 'won': 'win', 'winning': 'win',
            # sell
            'sells': 'sell', 'sold': 'sell', 'selling': 'sell',
        }

        # Irregular plurals (plural -> singular).
        # NOTE: 'leaves' also appears in irregular_verbs ('leave'); the verb
        # lookup runs first in _analyze_token, so the verb reading wins.
        self.irregular_plurals = {
            'men': 'man', 'women': 'woman', 'children': 'child',
            'feet': 'foot', 'teeth': 'tooth', 'geese': 'goose',
            'mice': 'mouse', 'people': 'person', 'lives': 'life',
            'wives': 'wife', 'knives': 'knife', 'leaves': 'leaf',
            'selves': 'self', 'halves': 'half', 'wolves': 'wolf',
            'thieves': 'thief', 'shelves': 'shelf', 'loaves': 'loaf',
            'potatoes': 'potato', 'tomatoes': 'tomato', 'heroes': 'hero',
            'analyses': 'analysis', 'bases': 'basis', 'crises': 'crisis',
            'criteria': 'criterion', 'phenomena': 'phenomenon',
        }

        # Function words (closed classes)
        self.determiners = {'the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'its', 'our', 'their'}
        self.pronouns = {'i', 'me', 'my', 'mine', 'you', 'your', 'yours', 'he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'we', 'us', 'our', 'ours', 'they', 'them', 'their', 'theirs', 'who', 'whom', 'whose', 'which', 'what', 'this', 'that', 'these', 'those'}
        self.prepositions = {'in', 'on', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'of'}
        self.conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so', 'because', 'although', 'while', 'if', 'when', 'where', 'as', 'than', 'whether', 'that'}
        self.auxiliaries = {'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could'}

        # Derivational suffix -> most likely POS (checked in insertion order)
        self.suffix_pos = {
            # nouns
            'tion': 'NN', 'sion': 'NN', 'ment': 'NN', 'ness': 'NN', 'ity': 'NN',
            'er': 'NN', 'or': 'NN', 'ist': 'NN', 'ism': 'NN', 'ance': 'NN', 'ence': 'NN',
            # adjectives
            'able': 'JJ', 'ible': 'JJ', 'ful': 'JJ', 'less': 'JJ', 'ous': 'JJ',
            'ive': 'JJ', 'al': 'JJ', 'ic': 'JJ', 'ical': 'JJ',
            # adverbs
            'ly': 'RB',
        }

    def _build_rules(self):
        """Build the regex stripping rules.

        Each list is tried top-to-bottom and the first matching pattern wins,
        so specific spellings (consonant doubling, silent-e stems) must come
        before the bare suffix-strip fallback.  The vowel/consonant classes
        are heuristics: CVC-final stems ("mak", "lik") usually dropped a
        silent 'e'; a doubled final consonant ("runn", "stopp") is undoubled.
        """
        # Verb inflection patterns: (pattern, replacement, POS)
        self.verb_patterns = [
            # -ing (present participle)
            (r'(.+)ying$', r'\1y', 'VBG'),                                 # studying -> study
            (r'(.+?)([bdgkmnprt])\2ing$', r'\1\2', 'VBG'),                 # running -> run
            (r'((?:.*[^aeiou])?[aeiou][^aeiouwxy])ing$', r'\1e', 'VBG'),   # making -> make
            (r'(.+)ing$', r'\1', 'VBG'),                                   # walking -> walk
            # -ed (past / past participle)
            (r'(.+)ied$', r'\1y', 'VBD'),                                  # studied -> study
            (r'(.+ee)d$', r'\1', 'VBD'),                                   # agreed -> agree
            (r'(.+?)([bdgkmnprt])\2ed$', r'\1\2', 'VBD'),                  # stopped -> stop
            (r'((?:.*[^aeiou])?[aeiou][^aeiouwxy])ed$', r'\1e', 'VBD'),    # liked -> like
            (r'(.+)ed$', r'\1', 'VBD'),                                    # walked -> walk
            # -s / -es (3rd person singular)
            (r'(.+)ies$', r'\1y', 'VBZ'),                                  # studies -> study
            (r'(.+(?:ss|x|z|ch|sh))es$', r'\1', 'VBZ'),                    # catches -> catch
            (r'(.+[^aeiou]o)es$', r'\1', 'VBZ'),                           # echoes -> echo
            (r'(.+[^s])s$', r'\1', 'VBZ'),                                 # walks -> walk  (not "ss")
        ]

        # Noun plural patterns.  The unreachable duplicate "ves -> \1fe" rule
        # was dropped: wife/knife/life-type plurals live in irregular_plurals.
        self.plural_patterns = [
            (r'(.+)ies$', r'\1y'),                # cities -> city
            (r'(.+)ves$', r'\1f'),                # wolves -> wolf
            (r'(.+(?:ss|x|z|ch|sh))es$', r'\1'),  # boxes -> box
            (r'(.+[^aeiou]o)es$', r'\1'),         # volcanoes -> volcano
            (r'(.+[^s])s$', r'\1'),               # cats -> cat
        ]

        # Comparative / superlative patterns: (pattern, replacement, POS)
        self.comparative_patterns = [
            (r'(.+)ier$', r'\1y', 'JJR'),                                  # happier -> happy
            (r'(.+)iest$', r'\1y', 'JJS'),                                 # happiest -> happy
            (r'(.+?)([bdgmnpt])\2er$', r'\1\2', 'JJR'),                    # bigger -> big
            (r'(.+?)([bdgmnpt])\2est$', r'\1\2', 'JJS'),                   # biggest -> big
            (r'((?:.*[^aeiou])?[aeiou][^aeiouwxy])er$', r'\1e', 'JJR'),    # nicer -> nice
            (r'((?:.*[^aeiou])?[aeiou][^aeiouwxy])est$', r'\1e', 'JJS'),   # nicest -> nice
            (r'(.+)er$', r'\1', 'JJR'),                                    # faster -> fast
            (r'(.+)est$', r'\1', 'JJS'),                                   # fastest -> fast
        ]

    def analyze(self, text: str) -> List[Morpheme]:
        """Tokenize *text* and return one Morpheme per token."""
        if not text:
            return []

        tokens = self._tokenize(text)
        result = []

        for token, start, end in tokens:
            morpheme = self._analyze_token(token, start, end)
            result.append(morpheme)

        return result

    def _tokenize(self, text: str) -> List[Tuple[str, int, int]]:
        """Split *text* into (token, start, end) triples.

        Tokens are alphabetic runs, digit runs, or single non-space symbols;
        whitespace is discarded.
        """
        tokens = []
        for match in re.finditer(r"[a-zA-Z]+|[0-9]+|[^\s\w]", text):
            tokens.append((match.group(), match.start(), match.end()))
        return tokens

    def _analyze_token(self, token: str, start: int, end: int) -> Morpheme:
        """Analyze a single token (see class docstring for stage order)."""
        lower = token.lower()

        # 1. Function words (closed classes)
        if lower in self.determiners:
            return Morpheme(token, lower, 'DT', start, end)
        if lower in self.pronouns:
            return Morpheme(token, lower, 'PRP', start, end)
        if lower in self.prepositions:
            return Morpheme(token, lower, 'IN', start, end)
        if lower in self.conjunctions:
            return Morpheme(token, lower, 'CC', start, end)
        if lower in self.auxiliaries:
            lemma = self.irregular_verbs.get(lower, lower)
            return Morpheme(token, lemma, 'VB', start, end)

        # 2. Irregular verbs
        if lower in self.irregular_verbs:
            lemma = self.irregular_verbs[lower]
            pos = self._get_verb_form(lower)
            return Morpheme(token, lemma, pos, start, end)

        # 3. Irregular plurals
        if lower in self.irregular_plurals:
            lemma = self.irregular_plurals[lower]
            return Morpheme(token, lemma, 'NNS', start, end)

        # 4. Rule-based stripping.
        # Verb inflection first; this means an ambiguous "-s" token (e.g.
        # "cats") is read as VBZ rather than NNS — context-free limitation.
        for pattern, replacement, pos in self.verb_patterns:
            if re.match(pattern, lower):
                lemma = re.sub(pattern, replacement, lower)
                return Morpheme(token, lemma, pos, start, end)

        # Plurals (length guard avoids mangling very short words)
        for pattern, replacement in self.plural_patterns:
            if re.match(pattern, lower) and len(lower) > 3:
                lemma = re.sub(pattern, replacement, lower)
                return Morpheme(token, lemma, 'NNS', start, end)

        # Comparative / superlative
        for pattern, replacement, pos in self.comparative_patterns:
            if re.match(pattern, lower):
                lemma = re.sub(pattern, replacement, lower)
                return Morpheme(token, lemma, pos, start, end)

        # 5. Derivational suffix -> POS guess (only for long enough stems)
        for suffix, pos in self.suffix_pos.items():
            if lower.endswith(suffix) and len(lower) > len(suffix) + 2:
                return Morpheme(token, lower, pos, start, end)

        # 6. Capitalized and not sentence-initial -> proper noun
        if token[0].isupper() and start > 0:
            return Morpheme(token, token, 'NNP', start, end)

        # 7. Default: common noun
        return Morpheme(token, lower, 'NN', start, end)

    def _get_verb_form(self, word: str) -> str:
        """Classify an (irregular) verb form into VBG/VBD/VBN/VBZ/VB."""
        if word.endswith('ing'):
            return 'VBG'
        elif word.endswith('ed') or word in {'was', 'were', 'had', 'did', 'went', 'came', 'saw', 'took', 'made', 'said', 'got', 'gave', 'found', 'thought', 'told', 'became', 'left', 'kept', 'began', 'wrote', 'ran', 'spoke', 'bought', 'brought', 'sat', 'stood', 'lost', 'paid', 'met', 'sent', 'built', 'fell', 'drove', 'broke', 'grew', 'chose', 'ate', 'drew', 'flew', 'threw', 'caught', 'taught', 'wore', 'won', 'sold'}:
            return 'VBD'
        elif word.endswith('en') or word in {'been', 'done', 'gone', 'seen', 'known', 'given', 'taken', 'written', 'spoken', 'chosen', 'eaten', 'drawn', 'flown', 'thrown', 'worn', 'driven', 'broken', 'grown', 'fallen', 'forgotten', 'gotten'}:
            return 'VBN'
        elif word.endswith('s'):
            return 'VBZ'
        else:
            return 'VB'

    def lemmatize(self, text: str) -> List[str]:
        """Return the lemma of every token in *text*."""
        return [m.lemma for m in self.analyze(text)]

    def pos_tag(self, text: str) -> List[Tuple[str, str]]:
        """Return (surface, POS) pairs for every token in *text*."""
        return [(m.surface, m.pos) for m in self.analyze(text)]