tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/english.py
@@ -0,0 +1,315 @@
+ """
+ English Morphological Analyzer - self-contained implementation
+ ==========================================
+
+ Features:
+ - Stemming
+ - Irregular verb/noun handling
+ - Affix separation (prefix/suffix)
+ - POS tagging
+
+ POS tags (Penn Treebank):
+ - NN: noun, NNS: plural noun, NNP: proper noun
+ - VB: base form, VBD: past tense, VBG: present participle, VBN: past participle, VBZ: 3rd person singular
+ - JJ: adjective, JJR: comparative, JJS: superlative
+ - RB: adverb, RBR: comparative, RBS: superlative
+ - DT: determiner, IN: preposition, CC: conjunction
+ - PRP: personal pronoun, PRP$: possessive pronoun
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Optional
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class Morpheme:
+     surface: str
+     lemma: str
+     pos: str
+     start: int
+     end: int
+
+     def __repr__(self):
+         return f"{self.surface}/{self.pos}"
+
+
+ class EnglishAnalyzer:
+     """English morphological analyzer"""
+
+     def __init__(self):
+         self._build_dictionary()
+         self._build_rules()
+
+     def _build_dictionary(self):
+         """Build the lookup dictionaries"""
+
+         # Irregular verbs (inflected form -> lemma)
+         self.irregular_verbs = {
+             # be
+             'am': 'be', 'is': 'be', 'are': 'be', 'was': 'be', 'were': 'be', 'been': 'be', 'being': 'be',
+             # have
+             'has': 'have', 'had': 'have', 'having': 'have',
+             # do
+             'does': 'do', 'did': 'do', 'done': 'do', 'doing': 'do',
+             # go
+             'goes': 'go', 'went': 'go', 'gone': 'go', 'going': 'go',
+             # come
+             'came': 'come', 'coming': 'come',
+             # get
+             'gets': 'get', 'got': 'get', 'gotten': 'get', 'getting': 'get',
+             # make
+             'makes': 'make', 'made': 'make', 'making': 'make',
+             # say
+             'says': 'say', 'said': 'say', 'saying': 'say',
+             # take
+             'takes': 'take', 'took': 'take', 'taken': 'take', 'taking': 'take',
+             # see
+             'sees': 'see', 'saw': 'see', 'seen': 'see', 'seeing': 'see',
+             # know
+             'knows': 'know', 'knew': 'know', 'known': 'know', 'knowing': 'know',
+             # think
+             'thinks': 'think', 'thought': 'think', 'thinking': 'think',
+             # give
+             'gives': 'give', 'gave': 'give', 'given': 'give', 'giving': 'give',
+             # find
+             'finds': 'find', 'found': 'find', 'finding': 'find',
+             # tell
+             'tells': 'tell', 'told': 'tell', 'telling': 'tell',
+             # become
+             'becomes': 'become', 'became': 'become', 'becoming': 'become',
+             # leave
+             'leaves': 'leave', 'left': 'leave', 'leaving': 'leave',
+             # put
+             'puts': 'put', 'putting': 'put',
+             # keep
+             'keeps': 'keep', 'kept': 'keep', 'keeping': 'keep',
+             # let
+             'lets': 'let', 'letting': 'let',
+             # begin
+             'begins': 'begin', 'began': 'begin', 'begun': 'begin', 'beginning': 'begin',
+             # write
+             'writes': 'write', 'wrote': 'write', 'written': 'write', 'writing': 'write',
+             # run
+             'runs': 'run', 'ran': 'run', 'running': 'run',
+             # read
+             'reads': 'read', 'reading': 'read',
+             # speak
+             'speaks': 'speak', 'spoke': 'speak', 'spoken': 'speak', 'speaking': 'speak',
+             # buy
+             'buys': 'buy', 'bought': 'buy', 'buying': 'buy',
+             # bring
+             'brings': 'bring', 'brought': 'bring', 'bringing': 'bring',
+             # sit
+             'sits': 'sit', 'sat': 'sit', 'sitting': 'sit',
+             # stand
+             'stands': 'stand', 'stood': 'stand', 'standing': 'stand',
+             # lose
+             'loses': 'lose', 'lost': 'lose', 'losing': 'lose',
+             # pay
+             'pays': 'pay', 'paid': 'pay', 'paying': 'pay',
+             # meet
+             'meets': 'meet', 'met': 'meet', 'meeting': 'meet',
+             # send
+             'sends': 'send', 'sent': 'send', 'sending': 'send',
+             # build
+             'builds': 'build', 'built': 'build', 'building': 'build',
+             # fall
+             'falls': 'fall', 'fell': 'fall', 'fallen': 'fall', 'falling': 'fall',
+             # cut
+             'cuts': 'cut', 'cutting': 'cut',
+             # drive
+             'drives': 'drive', 'drove': 'drive', 'driven': 'drive', 'driving': 'drive',
+             # break
+             'breaks': 'break', 'broke': 'break', 'broken': 'break', 'breaking': 'break',
+             # grow
+             'grows': 'grow', 'grew': 'grow', 'grown': 'grow', 'growing': 'grow',
+             # choose
+             'chooses': 'choose', 'chose': 'choose', 'chosen': 'choose', 'choosing': 'choose',
+             # eat
+             'eats': 'eat', 'ate': 'eat', 'eaten': 'eat', 'eating': 'eat',
+             # draw
+             'draws': 'draw', 'drew': 'draw', 'drawn': 'draw', 'drawing': 'draw',
+             # fly
+             'flies': 'fly', 'flew': 'fly', 'flown': 'fly', 'flying': 'fly',
+             # throw
+             'throws': 'throw', 'threw': 'throw', 'thrown': 'throw', 'throwing': 'throw',
+             # catch
+             'catches': 'catch', 'caught': 'catch', 'catching': 'catch',
+             # teach
+             'teaches': 'teach', 'taught': 'teach', 'teaching': 'teach',
+             # wear
+             'wears': 'wear', 'wore': 'wear', 'worn': 'wear', 'wearing': 'wear',
+             # win
+             'wins': 'win', 'won': 'win', 'winning': 'win',
+             # sell
+             'sells': 'sell', 'sold': 'sell', 'selling': 'sell',
+         }
+
+         # Irregular plurals
+         self.irregular_plurals = {
+             'men': 'man', 'women': 'woman', 'children': 'child',
+             'feet': 'foot', 'teeth': 'tooth', 'geese': 'goose',
+             'mice': 'mouse', 'people': 'person', 'lives': 'life',
+             'wives': 'wife', 'knives': 'knife', 'leaves': 'leaf',
+             'selves': 'self', 'halves': 'half', 'wolves': 'wolf',
+             'thieves': 'thief', 'shelves': 'shelf', 'loaves': 'loaf',
+             'potatoes': 'potato', 'tomatoes': 'tomato', 'heroes': 'hero',
+             'analyses': 'analysis', 'bases': 'basis', 'crises': 'crisis',
+             'criteria': 'criterion', 'phenomena': 'phenomenon',
+         }
+
+         # Function words (closed class)
+         self.determiners = {'the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'its', 'our', 'their'}
+         self.pronouns = {'i', 'me', 'my', 'mine', 'you', 'your', 'yours', 'he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'we', 'us', 'our', 'ours', 'they', 'them', 'their', 'theirs', 'who', 'whom', 'whose', 'which', 'what', 'this', 'that', 'these', 'those'}
+         self.prepositions = {'in', 'on', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'of'}
+         self.conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so', 'because', 'although', 'while', 'if', 'when', 'where', 'as', 'than', 'whether', 'that'}
+         self.auxiliaries = {'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could'}
+
+         # Suffix -> POS
+         self.suffix_pos = {
+             # Nouns
+             'tion': 'NN', 'sion': 'NN', 'ment': 'NN', 'ness': 'NN', 'ity': 'NN',
+             'er': 'NN', 'or': 'NN', 'ist': 'NN', 'ism': 'NN', 'ance': 'NN', 'ence': 'NN',
+             # Adjectives
+             'able': 'JJ', 'ible': 'JJ', 'ful': 'JJ', 'less': 'JJ', 'ous': 'JJ',
+             'ive': 'JJ', 'al': 'JJ', 'ic': 'JJ', 'ical': 'JJ',
+             # Adverbs
+             'ly': 'RB',
+         }
+
+     def _build_rules(self):
+         """Build the inflection rules"""
+         # Verb inflection patterns (tried in order; first match wins)
+         self.verb_patterns = [
+             # -ing (present participle)
+             (r'(.+)ying$', r'\1y', 'VBG'),   # studying -> study
+             (r'(.+)ing$', r'\1e', 'VBG'),    # making -> make
+             (r'(.+)ing$', r'\1', 'VBG'),     # going -> go (never reached: the rule above matches first)
+             # -ed (past / past participle)
+             (r'(.+)ied$', r'\1y', 'VBD'),    # studied -> study
+             (r'(.+)ed$', r'\1e', 'VBD'),     # liked -> like
+             (r'(.+)ed$', r'\1', 'VBD'),      # walked -> walk (never reached: the rule above matches first)
+             # -s/-es (3rd person singular)
+             (r'(.+)ies$', r'\1y', 'VBZ'),    # studies -> study
+             (r'(.+)es$', r'\1', 'VBZ'),      # goes -> go
+             (r'(.+)s$', r'\1', 'VBZ'),       # walks -> walk
+         ]
+
+         # Noun plural patterns (checked only after the verb patterns above)
+         self.plural_patterns = [
+             (r'(.+)ies$', r'\1y'),    # cities -> city
+             (r'(.+)ves$', r'\1f'),    # calves -> calf
+             (r'(.+)ves$', r'\1fe'),   # wives -> wife (never reached: the rule above matches first)
+             (r'(.+)es$', r'\1'),      # boxes -> box
+             (r'(.+)s$', r'\1'),       # cats -> cat
+         ]
+
+         # Comparative/superlative patterns
+         self.comparative_patterns = [
+             (r'(.+)ier$', r'\1y', 'JJR'),   # happier -> happy
+             (r'(.+)iest$', r'\1y', 'JJS'),  # happiest -> happy
+             (r'(.+)er$', r'\1', 'JJR'),     # taller -> tall ('bigger' would yield 'bigg')
+             (r'(.+)est$', r'\1', 'JJS'),    # tallest -> tall
+         ]
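
A caveat worth flagging before moving on: `_analyze_token` below applies each list first-match-wins, so the bare-strip rules marked "never reached" above are shadowed, and because the verb patterns run before the plural patterns, the verb rule `(.+)s$` captures regular plurals such as 'cats' (tagged VBZ, not NNS). A quick hand-check of the -ed rules under that first-match assumption:

    import re

    rules = [(r'(.+)ied$', r'\1y'), (r'(.+)ed$', r'\1e'), (r'(.+)ed$', r'\1')]
    for word in ('studied', 'liked', 'walked'):
        for pat, rep in rules:
            if re.match(pat, word):
                print(word, '->', re.sub(pat, rep, word))
                break
    # studied -> study
    # liked -> like
    # walked -> walke   (the '\1e' rule matches first; the bare-strip rule never runs)
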
+
+     def analyze(self, text: str) -> List[Morpheme]:
+         """Analyze text into a list of morphemes"""
+         if not text:
+             return []
+
+         tokens = self._tokenize(text)
+         result = []
+
+         for token, start, end in tokens:
+             morpheme = self._analyze_token(token, start, end)
+             result.append(morpheme)
+
+         return result
+
+     def _tokenize(self, text: str) -> List[Tuple[str, int, int]]:
+         """Tokenize into runs of letters, runs of digits, or single punctuation marks"""
+         tokens = []
+         for match in re.finditer(r"[a-zA-Z]+|[0-9]+|[^\s\w]", text):
+             tokens.append((match.group(), match.start(), match.end()))
+         return tokens
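
A hand-worked example of what `_tokenize` returns (offsets are 0-based and end-exclusive, straight from `re.finditer`). Note that contractions split into three tokens, since the apostrophe only matches the punctuation branch:

    analyzer = EnglishAnalyzer()
    print(analyzer._tokenize("Don't stop!"))
    # [('Don', 0, 3), ("'", 3, 4), ('t', 4, 5), ('stop', 6, 10), ('!', 10, 11)]
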
+
+     def _analyze_token(self, token: str, start: int, end: int) -> Morpheme:
+         """Analyze a single token"""
+         lower = token.lower()
+
+         # 1. Function-word check
+         if lower in self.determiners:
+             return Morpheme(token, lower, 'DT', start, end)
+         if lower in self.pronouns:
+             return Morpheme(token, lower, 'PRP', start, end)
+         if lower in self.prepositions:
+             return Morpheme(token, lower, 'IN', start, end)
+         if lower in self.conjunctions:
+             return Morpheme(token, lower, 'CC', start, end)
+         if lower in self.auxiliaries:
+             lemma = self.irregular_verbs.get(lower, lower)
+             return Morpheme(token, lemma, 'VB', start, end)
+
+         # 2. Irregular-verb check
+         if lower in self.irregular_verbs:
+             lemma = self.irregular_verbs[lower]
+             pos = self._get_verb_form(lower)
+             return Morpheme(token, lemma, pos, start, end)
+
+         # 3. Irregular-plural check
+         if lower in self.irregular_plurals:
+             lemma = self.irregular_plurals[lower]
+             return Morpheme(token, lemma, 'NNS', start, end)
+
+         # 4. Rule-based analysis
+         # Verb inflection
+         for pattern, replacement, pos in self.verb_patterns:
+             if re.match(pattern, lower):
+                 lemma = re.sub(pattern, replacement, lower)
+                 return Morpheme(token, lemma, pos, start, end)
+
+         # Plurals
+         for pattern, replacement in self.plural_patterns:
+             if re.match(pattern, lower) and len(lower) > 3:
+                 lemma = re.sub(pattern, replacement, lower)
+                 return Morpheme(token, lemma, 'NNS', start, end)
+
+         # Comparative/superlative
+         for pattern, replacement, pos in self.comparative_patterns:
+             if re.match(pattern, lower):
+                 lemma = re.sub(pattern, replacement, lower)
+                 return Morpheme(token, lemma, pos, start, end)
+
+         # 5. Suffix-based POS guess
+         for suffix, pos in self.suffix_pos.items():
+             if lower.endswith(suffix) and len(lower) > len(suffix) + 2:
+                 return Morpheme(token, lower, pos, start, end)
+
+         # 6. Capitalized and not sentence-initial -> proper noun
+         if token[0].isupper() and start > 0:
+             return Morpheme(token, token, 'NNP', start, end)
+
+         # 7. Default: noun
+         return Morpheme(token, lower, 'NN', start, end)
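
Tracing the cascade above by hand for a few representative tokens (assuming they appear mid-sentence, i.e. start > 0):

    # 'these'      -> step 1 (determiner set)             -> these/DT
    # 'children'   -> step 3 (irregular plural table)     -> child/NNS
    # 'happier'    -> step 4 (comparative rule (.+)ier$)  -> happy/JJR
    # 'government' -> step 5 (suffix '-ment')             -> government/NN
    # 'London'     -> step 6 (capitalized, start > 0)     -> London/NNP

One more ordering consequence: the comparative rule `(.+)er$` fires before the suffix table is consulted, so agentive nouns like 'teacher' come out as JJR with lemma 'teach' rather than NN.
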
+
+     def _get_verb_form(self, word: str) -> str:
+         """Determine the verb form tag"""
+         if word.endswith('ing'):
+             return 'VBG'
+         elif word.endswith('ed') or word in {'was', 'were', 'had', 'did', 'went', 'came', 'saw', 'took', 'made', 'said', 'got', 'gave', 'found', 'thought', 'told', 'became', 'left', 'kept', 'began', 'wrote', 'ran', 'spoke', 'bought', 'brought', 'sat', 'stood', 'lost', 'paid', 'met', 'sent', 'built', 'fell', 'drove', 'broke', 'grew', 'chose', 'ate', 'drew', 'flew', 'threw', 'caught', 'taught', 'wore', 'won', 'sold'}:
+             return 'VBD'
+         elif word.endswith('en') or word in {'been', 'done', 'gone', 'seen', 'known', 'given', 'taken', 'written', 'spoken', 'chosen', 'eaten', 'drawn', 'flown', 'thrown', 'worn', 'driven', 'broken', 'grown', 'fallen', 'forgotten', 'gotten'}:
+             return 'VBN'
+         elif word.endswith('s'):
+             return 'VBZ'
+         else:
+             return 'VB'
+
+     def lemmatize(self, text: str) -> List[str]:
+         """Return the lemma of each token"""
+         return [m.lemma for m in self.analyze(text)]
+
+     def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+         """Return (surface, POS) pairs"""
+         return [(m.surface, m.pos) for m in self.analyze(text)]
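
Taken together, a minimal usage sketch of the module's public surface (outputs hand-derived from the rules above, not an authoritative run):

    analyzer = EnglishAnalyzer()

    print(analyzer.analyze("She wrote stories"))
    # [She/PRP, wrote/VBD, stories/VBZ]
    # (Morpheme.__repr__ prints 'surface/pos'; 'stories' hits the verb rule
    #  '(.+)ies$' before any plural rule, hence VBZ.)

    print(analyzer.lemmatize("She wrote stories"))
    # ['she', 'write', 'story']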