tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0

tokmor/morphology/hindi_advanced.py
@@ -0,0 +1,258 @@
+"""
+Hindi Advanced Morphological Analyzer
+=====================================
+
+A Hindi morphological analyzer supporting five advanced features.
+
+Features:
+- Devanagari script handling
+- Postposition separation
+- Verb conjugation analysis
+"""
+
+import re
+from typing import List, Tuple, Dict, Optional
+
+from .advanced_base import (
+    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+)
+
+
+class HindiAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+    """Hindi advanced morphological analyzer"""
+
+    LANG_CODE = "hi"
+    LANG_NAME = "Hindi"
+
+    # Devanagari character patterns
+    DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u097F]+')
+    NUMBER_PATTERN = re.compile(r'[0-9०-९]+(?:[.,][0-9०-९]+)?')
+
+    def __init__(self):
+        super().__init__()
+
+    def _build_base_dictionary(self):
+        """Build the base dictionaries"""
+
+        # Postpositions
+        self.postpositions = {
+            'का': 'PSP', 'की': 'PSP', 'के': 'PSP',  # genitive (of)
+            'को': 'PSP',  # dative (to)
+            'से': 'PSP',  # ablative (from)
+            'में': 'PSP',  # locative (in)
+            'पर': 'PSP',  # locative (on)
+            'तक': 'PSP',  # until
+            'लिए': 'PSP',  # for
+            'साथ': 'PSP',  # with
+            'बिना': 'PSP',  # without
+            'द्वारा': 'PSP',  # by (agent)
+            'बारे': 'PSP',  # about
+            'ने': 'PSP',  # ergative marker
+        }
+
+        # Pronouns
+        self.pronouns = {
+            'मैं': 'PRON', 'तू': 'PRON', 'तुम': 'PRON', 'आप': 'PRON',
+            'वह': 'PRON', 'यह': 'PRON', 'वे': 'PRON', 'ये': 'PRON',
+            'हम': 'PRON', 'वो': 'PRON',
+            'मुझे': 'PRON', 'तुझे': 'PRON', 'उसे': 'PRON', 'इसे': 'PRON',
+            'हमें': 'PRON', 'उन्हें': 'PRON', 'इन्हें': 'PRON',
+            'मेरा': 'PRON', 'तेरा': 'PRON', 'उसका': 'PRON', 'इसका': 'PRON',
+            'कौन': 'PRON', 'क्या': 'PRON', 'कहाँ': 'PRON', 'कब': 'PRON',
+            'जो': 'REL', 'जिसे': 'REL', 'जिसको': 'REL',
+        }
+
+        # Conjunctions
+        self.conjunctions = {
+            'और': 'CONJ', 'या': 'CONJ', 'लेकिन': 'CONJ', 'परंतु': 'CONJ',
+            'कि': 'CONJ', 'क्योंकि': 'CONJ', 'अगर': 'CONJ', 'यदि': 'CONJ',
+            'जब': 'CONJ', 'तो': 'CONJ', 'तब': 'CONJ', 'फिर': 'CONJ',
+            'इसलिए': 'CONJ', 'जबकि': 'CONJ', 'हालांकि': 'CONJ',
+        }
+
+        # Adverbs
+        self.adverbs = {
+            'बहुत': 'ADV', 'अच्छी': 'ADV', 'तरह': 'ADV', 'जल्दी': 'ADV',
+            'धीरे': 'ADV', 'अभी': 'ADV', 'अब': 'ADV', 'कल': 'ADV',
+            'आज': 'ADV', 'फिर': 'ADV', 'यहाँ': 'ADV', 'वहाँ': 'ADV',
+            'कभी': 'ADV', 'हमेशा': 'ADV', 'बस': 'ADV',
+            'सिर्फ': 'ADV', 'भी': 'ADV', 'ही': 'ADV', 'तो': 'ADV',
+        }
+
+        # Negators / particles
+        self.particles = {
+            'नहीं': 'NEG', 'न': 'NEG', 'मत': 'NEG',
+            'ही': 'PRT', 'भी': 'PRT', 'तो': 'PRT',
+            'जी': 'PRT', 'हाँ': 'PRT',
+        }
+
+        # Auxiliaries
+        self.auxiliaries = {
+            'है': 'AUX', 'हैं': 'AUX', 'था': 'AUX', 'थी': 'AUX', 'थे': 'AUX', 'थीं': 'AUX',
+            'हूँ': 'AUX', 'हो': 'AUX',
+            'रहा': 'AUX', 'रही': 'AUX', 'रहे': 'AUX',
+            'गया': 'AUX', 'गयी': 'AUX', 'गए': 'AUX',
+            'सकता': 'AUX', 'सकती': 'AUX', 'सकते': 'AUX',
+            'चाहिए': 'AUX', 'होगा': 'AUX', 'होगी': 'AUX',
+        }
+
+        # Common nouns (high frequency)
+        self.common_nouns = {
+            'लोग': 'NC', 'आदमी': 'NC', 'औरत': 'NC', 'बच्चा': 'NC',
+            'घर': 'NC', 'काम': 'NC', 'समय': 'NC', 'दिन': 'NC',
+            'साल': 'NC', 'बात': 'NC', 'पानी': 'NC', 'खाना': 'NC',
+            'देश': 'NC', 'शहर': 'NC', 'सरकार': 'NC', 'कंपनी': 'NC',
+        }
+
+        # Verb stems (high frequency)
+        self.verb_stems = {
+            'कर': 'V', 'हो': 'V', 'जा': 'V', 'आ': 'V', 'दे': 'V',
+            'ले': 'V', 'रह': 'V', 'बोल': 'V', 'कह': 'V', 'सुन': 'V',
+            'देख': 'V', 'खा': 'V', 'पी': 'V', 'लिख': 'V', 'पढ़': 'V',
+            'चल': 'V', 'बैठ': 'V', 'उठ': 'V', 'सो': 'V', 'मिल': 'V',
+        }
+
+    def _build_domain_dictionaries(self):
+        """Domain-specific dictionaries"""
+        self._domain_dictionaries[Domain.TECH] = {
+            'सेब': ('Apple', 'NP'),  # Apple company
+            'बादल': ('cloud', 'NC'),  # cloud computing
+        }
+        self._domain_dictionaries[Domain.FOOD] = {
+            'सेब': ('सेब', 'NC'),  # apple (fruit)
+        }
+        self._domain_dictionaries[Domain.FINANCE] = {
+            'बैंक': ('बैंक', 'NC'),  # bank
+            'शेयर': ('शेयर', 'NC'),  # stock
+        }
+
+    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+        if not text or not text.strip():
+            return [AnalysisResult([])]
+        morphemes = self._analyze_text(text, domain)
+        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+        result.score = self._score_analysis(result)
+        return [result]
+
+    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+        result = []
+        pos = 0
+        while pos < len(text):
+            if text[pos].isspace():
+                pos += 1
+                continue
+
+            devanagari_match = self.DEVANAGARI_PATTERN.match(text[pos:])
+            if devanagari_match:
+                word = devanagari_match.group()
+                morpheme = self._analyze_word(word, pos, domain)
+                result.append(morpheme)
+                pos += len(word)
+                continue
+
+            # Latin script (English loanwords)
+            latin_match = re.match(r'[a-zA-Z]+', text[pos:])
+            if latin_match:
+                word = latin_match.group()
+                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
+                pos += len(word)
+                continue
+
+            num_match = self.NUMBER_PATTERN.match(text[pos:])
+            if num_match:
+                num = num_match.group()
+                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                pos += len(num)
+                continue
+
+            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+            pos += 1
+        return result
+
+    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+        """Analyze a single word"""
+
+        # Runtime (user) dictionary
+        if word in self._user_dictionary:
+            lemma, pos_tag, _ = self._user_dictionary[word]
+            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+        # Domain dictionary
+        domain_sense = self._get_domain_sense(word, domain)
+        if domain_sense:
+            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+        # Postpositions
+        if word in self.postpositions:
+            return Morpheme(surface=word, lemma=word, pos='PSP', start=offset, end=offset + len(word))
+
+        # Pronouns (tag taken from the dictionary; relatives are REL)
+        if word in self.pronouns:
+            return Morpheme(surface=word, lemma=word, pos=self.pronouns[word], start=offset, end=offset + len(word))
+
+        # Conjunctions
+        if word in self.conjunctions:
+            return Morpheme(surface=word, lemma=word, pos='CONJ', start=offset, end=offset + len(word))
+
+        # Adverbs
+        if word in self.adverbs:
+            return Morpheme(surface=word, lemma=word, pos='ADV', start=offset, end=offset + len(word))
+
+        # Particles
+        if word in self.particles:
+            return Morpheme(surface=word, lemma=word, pos=self.particles[word], start=offset, end=offset + len(word))
+
+        # Auxiliaries
+        if word in self.auxiliaries:
+            return Morpheme(surface=word, lemma=word, pos='AUX', start=offset, end=offset + len(word))
+
+        # Common nouns
+        if word in self.common_nouns:
+            return Morpheme(surface=word, lemma=word, pos='NC', start=offset, end=offset + len(word))
+
+        # Check verb stems
+        for stem, pos_tag in self.verb_stems.items():
+            if word.startswith(stem) and len(word) > len(stem):
+                return Morpheme(surface=word, lemma=stem, pos='V', start=offset, end=offset + len(word))
+
+        # Suffix-based morphological analysis
+        lemma, pos_tag = self._analyze_morphology(word)
+        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+        """Suffix-based morphological analysis"""
+        # -ना verb infinitive
+        if word.endswith('ना') and len(word) > 2:
+            return (word, 'V')
+
+        # -ता/-ती/-ते imperfective participle
+        if word.endswith(('ता', 'ती', 'ते')) and len(word) > 2:
+            return (word[:-1], 'V')
+
+        # -ा/-ी/-े perfective participle
+        if word.endswith(('ा', 'ी', 'े')) and len(word) > 2:
+            return (word[:-1], 'V')
+
+        # -ई noun (feminine)
+        if word.endswith('ई') and len(word) > 2:
+            return (word, 'NC')
+
+        # -आ noun (masculine)
+        if word.endswith('आ') and len(word) > 2:
+            return (word, 'NC')
+
+        # Default: noun
+        return (word, 'NC')
+
+    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+        alternatives = []
+        other_domains = [d for d in Domain if d != domain][:count]
+        for alt_domain in other_domains:
+            morphemes = self._analyze_text(text, alt_domain)
+            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+            result.score = self._score_analysis(result) * 0.9
+            alternatives.append(result)
+        return alternatives
+
+
+HindiAnalyzer = HindiAdvancedAnalyzer

tokmor/morphology/japanese.py
@@ -0,0 +1,417 @@
+"""
+Japanese Morphological Analyzer - Self-Contained Implementation
+===============================================================
+
+A Japanese morphological analyzer implemented in pure Python, with no external libraries.
+
+Optional asset (high ROI):
+- If `TOKMOR_DATA_DIR/seg_lexicon/ja_wordfreq.pkl` is present, it is used as a
+  Viterbi candidate score for conservatively splitting long kanji runs.
+  (No assets are bundled with the core.)
+"""
+
+import re
+import math
+import pickle
+from typing import List, Tuple, Optional
+from dataclasses import dataclass
+
+from ..resources import resolve_seg_lexicon_path
+
+
+@dataclass
+class Morpheme:
+    """A single morpheme"""
+    surface: str
+    lemma: str
+    pos: str
+    start: int
+    end: int
+    reading: str = ''
+
+    def __repr__(self):
+        return f"{self.surface}/{self.pos}"
+
+
+class JapaneseAnalyzer:
+    """
+    Japanese morphological analyzer
+
+    Usage:
+        analyzer = JapaneseAnalyzer()
+        result = analyzer.analyze("東京に行きます")
+    """
+
+    def __init__(self):
+        # Optional kanji wordfreq lexicon (offline): {token: freq}
+        self._wordfreq = None
+        self._wordfreq_max_len = 4
+        self._build_dictionary()
+        self._load_seg_lexicon()
+
+    def _build_dictionary(self):
+        """Build the dictionaries"""
+
+        # Particles
+        self.particles = {
+            'は': 'HA', 'が': 'GA', 'を': 'WO', 'に': 'NI', 'へ': 'HE',
+            'で': 'DE', 'と': 'TO', 'から': 'KARA', 'まで': 'MADE',
+            'より': 'YORI', 'の': 'NO', 'も': 'MO', 'や': 'YA',
+            'など': 'NADO', 'か': 'KA', 'ね': 'NE', 'よ': 'YO',
+        }
+
+        # Auxiliary verbs / endings
+        self.auxiliaries = {
+            'です': 'AUX', 'ます': 'AUX', 'た': 'AUX', 'だ': 'AUX',
+            'ない': 'AUX', 'れる': 'AUX', 'られる': 'AUX',
+            'せる': 'AUX', 'させる': 'AUX', 'たい': 'AUX',
+            'ている': 'AUX', 'てる': 'AUX', 'ました': 'AUX',
+        }
+
+        # Nouns
+        self.nouns = {
+            '東京': ('名詞', 'トウキョウ'), '日本': ('名詞', 'ニホン'),
+            '大阪': ('名詞', 'オオサカ'), '京都': ('名詞', 'キョウト'),
+            '会社': ('名詞', 'カイシャ'), '学校': ('名詞', 'ガッコウ'),
+            '仕事': ('名詞', 'シゴト'), '人': ('名詞', 'ヒト'),
+            '時間': ('名詞', 'ジカン'), '今日': ('名詞', 'キョウ'),
+            '明日': ('名詞', 'アシタ'), '昨日': ('名詞', 'キノウ'),
+            '私': ('名詞', 'ワタシ'), '彼': ('名詞', 'カレ'),
+            '発表': ('名詞', 'ハッピョウ'), '自動車': ('名詞', 'ジドウシャ'),
+        }
+
+        # Verbs (godan / ichidan / kuru / suru conjugation classes)
+        self.verbs = {
+            '行': ('動詞', '行く', 'godan'),
+            '来': ('動詞', '来る', 'kuru'),
+            '見': ('動詞', '見る', 'ichidan'),
+            '食': ('動詞', '食べる', 'ichidan'),
+            '話': ('動詞', '話す', 'godan'),
+            '読': ('動詞', '読む', 'godan'),
+            '書': ('動詞', '書く', 'godan'),
+            '聞': ('動詞', '聞く', 'godan'),
+            '思': ('動詞', '思う', 'godan'),
+            '言': ('動詞', '言う', 'godan'),
+            'し': ('動詞', 'する', 'suru'),
+            'する': ('動詞', 'する', 'suru'),
+        }
+
+        # Adjectives (both stems and dictionary forms are registered)
+        self.adjectives = {
+            # i-adjective stems
+            '大き': ('形容詞', '大きい'),
+            '小さ': ('形容詞', '小さい'),
+            '高': ('形容詞', '高い'),
+            '安': ('形容詞', '安い'),
+            '新し': ('形容詞', '新しい'),
+            '古': ('形容詞', '古い'),
+            '良': ('形容詞', '良い'),
+            '悪': ('形容詞', '悪い'),
+            '長': ('形容詞', '長い'),
+            '短': ('形容詞', '短い'),
+            '早': ('形容詞', '早い'),
+            '遅': ('形容詞', '遅い'),
+            '強': ('形容詞', '強い'),
+            '弱': ('形容詞', '弱い'),
+            '多': ('形容詞', '多い'),
+            '少な': ('形容詞', '少ない'),
+            '美し': ('形容詞', '美しい'),
+            '難し': ('形容詞', '難しい'),
+            '易し': ('形容詞', '易しい'),
+            # i-adjective dictionary forms
+            '大きい': ('形容詞', '大きい'),
+            '小さい': ('形容詞', '小さい'),
+            '高い': ('形容詞', '高い'),
+            '安い': ('形容詞', '安い'),
+            '新しい': ('形容詞', '新しい'),
+            '古い': ('形容詞', '古い'),
+            '良い': ('形容詞', '良い'),
+            '悪い': ('形容詞', '悪い'),
+            '長い': ('形容詞', '長い'),
+            '短い': ('形容詞', '短い'),
+            '早い': ('形容詞', '早い'),
+            '遅い': ('形容詞', '遅い'),
+            '強い': ('形容詞', '強い'),
+            '弱い': ('形容詞', '弱い'),
+            '多い': ('形容詞', '多い'),
+            '少ない': ('形容詞', '少ない'),
+            '美しい': ('形容詞', '美しい'),
+            '難しい': ('形容詞', '難しい'),
+            '易しい': ('形容詞', '易しい'),
+        }
+
+        # Hiragana range
+        self.hiragana = re.compile(r'[\u3040-\u309f]+')
+        # Katakana range
+        self.katakana = re.compile(r'[\u30a0-\u30ff]+')
+        # Kanji range
+        self.kanji = re.compile(r'[\u4e00-\u9fff]+')
+
+    def _load_seg_lexicon(self) -> None:
+        p = resolve_seg_lexicon_path("ja")
+        if not p:
+            return
+        try:
+            obj = pickle.loads(p.read_bytes())
+            if not isinstance(obj, dict):
+                return
+            wf = {}
+            mx = 2
+            for k, v in obj.items():
+                if isinstance(k, str) and k and isinstance(v, int) and v > 0:
+                    # Japanese kanji-only lexicon expected; keep it conservative
+                    if len(k) < 2:
+                        continue
+                    wf[k] = int(v)
+                    if len(k) > mx:
+                        mx = len(k)
+            if wf:
+                self._wordfreq = wf
+                self._wordfreq_max_len = max(2, min(int(mx), 8))
+        except Exception:
+            return
+
+    def _viterbi_kanji_run(self, run: str, start: int) -> List[Morpheme]:
+        """
+        Viterbi segmentation over a pure-Kanji run using optional wordfreq.
+        If lexicon is missing, caller should not use this.
+        """
+        wf = self._wordfreq or {}
+        max_len = max(self._wordfreq_max_len, 4)
+        max_len = max(2, min(int(max_len), 8))
+        n = len(run)
+
+        # Conservative scoring: prefer longer spans, penalize 1-char, reward known spans.
+        len_bonus = 0.75
+        single_penalty = 1.2
+        unk_base = -1.6
+        unk_len_penalty = 0.35
+        freq_cap = 200_000
+
+        best = [-1e100] * (n + 1)
+        back = [-1] * (n + 1)
+        back_len = [1] * (n + 1)
+        best[0] = 0.0
+
+        for i in range(n):
+            if best[i] <= -1e90:
+                continue
+            # allow 1..max_len
+            for L in range(1, max_len + 1):
+                j = i + L
+                if j > n:
+                    break
+                span = run[i:j]
+                f = wf.get(span)
+                if f is not None:
+                    f2 = min(int(f), freq_cap)
+                    s = best[i] + math.log(f2 + 1.0) + len_bonus * (L - 1)
+                    if L == 1:
+                        s -= single_penalty
+                else:
+                    # unknown: allow grouping, but penalize longer unknown spans
+                    s = best[i] + unk_base - unk_len_penalty * (L - 1)
+                    if L == 1:
+                        s -= single_penalty
+
+                if s > best[j]:
+                    best[j] = s
+                    back[j] = i
+                    back_len[j] = L
+
+        # backtrack
+        out: List[Morpheme] = []
+        j = n
+        if best[j] <= -1e90:
+            # fallback: whole run
+            return [Morpheme(run, run, '名詞', start, start + n)]
+
+        while j > 0:
+            i = back[j]
+            L = back_len[j]
+            if i < 0:
+                break
+            span = run[i:j]
+            out.append(Morpheme(span, span, '名詞', start + i, start + j))
+            j = i
+        out.reverse()
+        return out
+
+    def analyze(self, text: str) -> List[Morpheme]:
+        """Morphological analysis"""
+        if not text:
+            return []
+
+        result = []
+        pos = 0
+
+        while pos < len(text):
+            matched = False
+
+            # Whitespace
+            if text[pos].isspace():
+                pos += 1
+                continue
+
+            # Check kanji runs (person / proper names first)
+            # Runs of 2+ kanji are first treated as proper names
+            if self.kanji.match(text[pos]):
+                kanji_match = self.kanji.match(text[pos:])
+                kanji_chunk = kanji_match.group()
+                # 2+ kanji followed by a particle (or end / space) -> proper noun
+                next_pos = pos + len(kanji_chunk)
+                if len(kanji_chunk) >= 2:
+                    if next_pos >= len(text) or text[next_pos] in self.particles or text[next_pos].isspace():
+                        # Try the dictionary first
+                        found_in_dict = False
+                        for l in range(len(kanji_chunk), 0, -1):
+                            if text[pos:pos+l] in self.nouns:
+                                info = self.nouns[text[pos:pos+l]]
+                                result.append(Morpheme(
+                                    text[pos:pos+l], text[pos:pos+l], info[0],
+                                    pos, pos + l, info[1]
+                                ))
+                                pos += l
+                                found_in_dict = True
+                                matched = True
+                                break
+                        if found_in_dict:
+                            continue
+                        # Optional: if we have a kanji lexicon, split long runs *only when it clearly helps*.
+                        if self._wordfreq and len(kanji_chunk) >= 6:
+                            parts = self._viterbi_kanji_run(kanji_chunk, pos)
+                            # accept only if we got multiple non-trivial parts (avoid over-fragmentation)
+                            if len(parts) >= 2 and all(len(p.surface) >= 2 for p in parts):
+                                result.extend(parts)
+                                pos += len(kanji_chunk)
+                                matched = True
+                                continue
+
+                        # Not in the dictionary: treat the whole run as a proper noun
+                        result.append(Morpheme(
+                            kanji_chunk, kanji_chunk, '固有名詞',
+                            pos, pos + len(kanji_chunk)
+                        ))
+                        pos += len(kanji_chunk)
+                        matched = True
+                        continue
+
+            # Longest-match dictionary lookup
+            for length in range(min(len(text) - pos, 10), 0, -1):
+                substring = text[pos:pos+length]
+
+                # Adjectives (checked before nouns) - standalone adjectives only
+                if substring in self.adjectives and length > 1:
+                    info = self.adjectives[substring]
+                    result.append(Morpheme(
+                        substring, info[1], '形容詞',
+                        pos, pos + length
+                    ))
+                    pos += length
+                    matched = True
+                    break
+
+                # Nouns
+                if substring in self.nouns:
+                    info = self.nouns[substring]
+                    result.append(Morpheme(
+                        substring, substring, info[0],
+                        pos, pos + length, info[1]
+                    ))
+                    pos += length
+                    matched = True
+                    break
+
+                # Particles
+                if substring in self.particles:
+                    result.append(Morpheme(
+                        substring, substring, '助詞',
+                        pos, pos + length
+                    ))
+                    pos += length
+                    matched = True
+                    break
+
+                # Auxiliaries
+                if substring in self.auxiliaries:
+                    result.append(Morpheme(
+                        substring, substring, '助動詞',
+                        pos, pos + length
+                    ))
+                    pos += length
+                    matched = True
+                    break
+
+            if not matched:
+                # Kanji chunk
+                if self.kanji.match(text[pos]):
+                    match = self.kanji.match(text[pos:])
+                    chunk = match.group()
+                    # If we have a kanji wordfreq lexicon, split long runs conservatively.
+                    if self._wordfreq and len(chunk) >= 5:
+                        result.extend(self._viterbi_kanji_run(chunk, pos))
+                    else:
+                        result.append(Morpheme(
+                            chunk, chunk, '名詞',
+                            pos, pos + len(chunk)
+                        ))
+                    pos += len(chunk)
+                # Hiragana chunk
+                elif self.hiragana.match(text[pos]):
+                    match = self.hiragana.match(text[pos:])
+                    chunk = match.group()
+                    # Try verb-conjugation analysis
+                    analyzed = self._analyze_verb_form(chunk, pos)
+                    if analyzed:
+                        result.extend(analyzed)
+                    else:
+                        result.append(Morpheme(
+                            chunk, chunk, '名詞',
+                            pos, pos + len(chunk)
+                        ))
+                    pos += len(chunk)
+                # Katakana chunk (loanwords)
+                elif self.katakana.match(text[pos]):
+                    match = self.katakana.match(text[pos:])
+                    chunk = match.group()
+                    result.append(Morpheme(
+                        chunk, chunk, '名詞',
+                        pos, pos + len(chunk)
+                    ))
+                    pos += len(chunk)
+                else:
+                    # Other (digits, symbols, etc.)
+                    result.append(Morpheme(
+                        text[pos], text[pos], '記号',
+                        pos, pos + 1
+                    ))
+                    pos += 1
+
+        return result
+
+    def _analyze_verb_form(self, form: str, offset: int) -> List[Morpheme]:
+        """Analyze a conjugated verb form"""
+        results = []
+
+        # -masu form
+        if form.endswith('ます'):
+            stem = form[:-2]
+            if stem:
+                results.append(Morpheme(stem, stem + 'る', '動詞', offset, offset + len(stem)))
+                results.append(Morpheme('ます', 'ます', '助動詞', offset + len(stem), offset + len(form)))
+            return results
+
+        # -ta form (past)
+        if form.endswith('た') or form.endswith('だ'):
+            stem = form[:-1]
+            if stem:
+                results.append(Morpheme(stem, stem, '動詞', offset, offset + len(stem)))
+                results.append(Morpheme(form[-1], form[-1], '助動詞', offset + len(stem), offset + len(form)))
+            return results
+
+        return []
+
+    def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+        """POS tagging"""
+        morphemes = self.analyze(text)
+        return [(m.surface, m.pos) for m in morphemes]
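
Two short sketches for orientation. First, the analyzer's behavior on the docstring's own example, traced by hand against the bundled mini-dictionary; note that `self.verbs` is built but never consulted by `analyze()`, so the lone kanji 行 falls through to the generic kanji branch as 名詞 rather than being lemmatized to 行く:

    from tokmor.morphology.japanese import JapaneseAnalyzer

    analyzer = JapaneseAnalyzer()
    print(analyzer.pos_tag("東京に行きます"))
    # [('東京', '名詞'), ('に', '助詞'), ('行', '名詞'), ('き', '動詞'), ('ます', '助動詞')]

Second, `_load_seg_lexicon` accepts any pickled `dict[str, int]` whose keys are two or more characters and whose counts are positive, so a compatible `ja_wordfreq.pkl` could be produced as below. Treating `TOKMOR_DATA_DIR` as an environment variable is an assumption; the actual lookup goes through `resolve_seg_lexicon_path("ja")` in tokmor/resources.py, which is not part of this hunk.

    import os
    import pickle
    import pathlib

    # Assumption: TOKMOR_DATA_DIR is an environment variable naming the data dir.
    freqs = {'自動車': 120_000, '発表': 80_000}  # token -> corpus frequency, keys >= 2 chars
    out = pathlib.Path(os.environ['TOKMOR_DATA_DIR']) / 'seg_lexicon' / 'ja_wordfreq.pkl'
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_bytes(pickle.dumps(freqs))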