tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,589 @@
"""
Japanese Advanced Morphological Analyzer
========================================

A Japanese morphological analyzer supporting five advanced features.

Features:
1. NER Gazetteer Integration - preserves named-entity boundaries
2. Real-time Dictionary Extension - extends the lexicon at runtime
3. Domain Adaptation - domain-specific analysis
4. Code-switching - handles mixed English-Japanese text
5. N-best Analysis - multiple candidates with confidence scores
"""

import re
import json
from pathlib import Path
from typing import List, Tuple, Dict, Set, Optional, Any

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)

# Extended dictionary location
from .. import resources

# Optional external asset dir (default: none). If you want extended dictionaries,
# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
DICT_DIR = resources.data_dir() / "extended_dict"

class JapaneseAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Advanced Japanese morphological analyzer.

    Usage:
        analyzer = JapaneseAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("東京に行きます")

        # Named-entity preservation
        analyzer.add_entity("東京大学", "ORG")
        result = analyzer.analyze("東京大学に行きます", preserve_entities=True)

        # Domain adaptation
        result = analyzer.analyze("株を買う", domain="finance")

        # N-best analysis
        result = analyzer.analyze("橋", n_best=3)
    """

    LANG_CODE = "ja"
    LANG_NAME = "Japanese"

    # Unicode patterns
    HIRAGANA_PATTERN = re.compile(r'[\u3040-\u309f]+')
    KATAKANA_PATTERN = re.compile(r'[\u30a0-\u30ff]+')
    KANJI_PATTERN = re.compile(r'[\u4e00-\u9fff]+')
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()
    def _build_base_dictionary(self):
        """Build the base dictionary."""

        # =================================================================
        # 助詞 (particles)
        # =================================================================
        self.particles = {
            # 格助詞 (case particles)
            'は': 'HA', 'が': 'GA', 'を': 'WO', 'に': 'NI', 'へ': 'HE',
            'で': 'DE', 'と': 'TO', 'から': 'KARA', 'まで': 'MADE',
            'より': 'YORI', 'の': 'NO',
            # 接続助詞 (conjunctive particles; 'が' above also serves here)
            'ば': 'BA', 'たら': 'TARA', 'ても': 'TEMO', 'けど': 'KEDO',
            'けれど': 'KEREDO', 'のに': 'NONI',
            # 副助詞 (adverbial particles)
            'も': 'MO', 'だけ': 'DAKE', 'しか': 'SHIKA', 'ばかり': 'BAKARI',
            'など': 'NADO', 'くらい': 'KURAI', 'ほど': 'HODO',
            # 終助詞 (sentence-final particles)
            'か': 'KA', 'ね': 'NE', 'よ': 'YO', 'な': 'NA', 'わ': 'WA',
            'ぞ': 'ZO', 'さ': 'SA',
        }

        # =================================================================
        # 助動詞 (auxiliary verbs)
        # =================================================================
        self.auxiliaries = {
            'です': 'AUX', 'ます': 'AUX', 'た': 'AUX', 'だ': 'AUX',
            'ない': 'AUX', 'れる': 'AUX', 'られる': 'AUX',
            'せる': 'AUX', 'させる': 'AUX', 'たい': 'AUX',
            'ている': 'AUX', 'てる': 'AUX', 'ました': 'AUX',
            'でした': 'AUX', 'ません': 'AUX', 'ではない': 'AUX',
        }

        # =================================================================
        # 名詞 (nouns)
        # =================================================================
        self.nouns = {
            # 地名 (place names)
            '東京': ('名詞', 'トウキョウ'),
            '大阪': ('名詞', 'オオサカ'),
            '京都': ('名詞', 'キョウト'),
            '日本': ('名詞', 'ニホン'),
            '中国': ('名詞', 'チュウゴク'),
            '韓国': ('名詞', 'カンコク'),
            'アメリカ': ('名詞', 'アメリカ'),
            # 組織 (organizations)
            '会社': ('名詞', 'カイシャ'),
            '学校': ('名詞', 'ガッコウ'),
            '大学': ('名詞', 'ダイガク'),
            '政府': ('名詞', 'セイフ'),
            '銀行': ('名詞', 'ギンコウ'),
            # 一般名詞 (common nouns)
            '人': ('名詞', 'ヒト'),
            '仕事': ('名詞', 'シゴト'),
            '時間': ('名詞', 'ジカン'),
            '今日': ('名詞', 'キョウ'),
            '明日': ('名詞', 'アシタ'),
            '昨日': ('名詞', 'キノウ'),
            '発表': ('名詞', 'ハッピョウ'),
            '自動車': ('名詞', 'ジドウシャ'),
            '電話': ('名詞', 'デンワ'),
            '食べ物': ('名詞', 'タベモノ'),
            '飲み物': ('名詞', 'ノミモノ'),
            '橋': ('名詞', 'ハシ'),  # polysemous: 橋/箸/端
            '株': ('名詞', 'カブ'),  # polysemous: 株式 / 木の株
            # 代名詞 (pronouns)
            '私': ('代名詞', 'ワタシ'),
            '僕': ('代名詞', 'ボク'),
            '彼': ('代名詞', 'カレ'),
            '彼女': ('代名詞', 'カノジョ'),
            'これ': ('代名詞', 'コレ'),
            'それ': ('代名詞', 'ソレ'),
            'あれ': ('代名詞', 'アレ'),
        }

        # =================================================================
        # 動詞 (verbs) - stem + conjugation type
        # =================================================================
        self.verbs = {
            # 五段動詞 (godan verbs)
            '行': ('動詞', '行く', 'godan_k'),
            '書': ('動詞', '書く', 'godan_k'),
            '聞': ('動詞', '聞く', 'godan_k'),
            '読': ('動詞', '読む', 'godan_m'),
            '飲': ('動詞', '飲む', 'godan_m'),
            '話': ('動詞', '話す', 'godan_s'),
            '待': ('動詞', '待つ', 'godan_t'),
            '買': ('動詞', '買う', 'godan_w'),
            '言': ('動詞', '言う', 'godan_w'),
            '思': ('動詞', '思う', 'godan_w'),
            # 一段動詞 (ichidan verbs)
            '見': ('動詞', '見る', 'ichidan'),
            '食': ('動詞', '食べる', 'ichidan'),
            '起': ('動詞', '起きる', 'ichidan'),
            '寝': ('動詞', '寝る', 'ichidan'),
            # カ変・サ変 (irregular kuru / suru)
            '来': ('動詞', '来る', 'kuru'),
            'し': ('動詞', 'する', 'suru'),
            'する': ('動詞', 'する', 'suru'),
        }

        # =================================================================
        # 形容詞 (i-adjectives) - base form + stem
        # =================================================================
        self.adjectives = {
            # 終止形 (dictionary form)
            '大きい': ('形容詞', '大きい'),
            '小さい': ('形容詞', '小さい'),
            '高い': ('形容詞', '高い'),
            '安い': ('形容詞', '安い'),
            '新しい': ('形容詞', '新しい'),
            '古い': ('形容詞', '古い'),
            '良い': ('形容詞', '良い'),
            'いい': ('形容詞', '良い'),  # colloquial form
            '悪い': ('形容詞', '悪い'),
            '美しい': ('形容詞', '美しい'),
            '嬉しい': ('形容詞', '嬉しい'),
            '楽しい': ('形容詞', '楽しい'),
            '難しい': ('形容詞', '難しい'),
            '早い': ('形容詞', '早い'),
            '速い': ('形容詞', '速い'),
            '遅い': ('形容詞', '遅い'),
            '若い': ('形容詞', '若い'),
            '白い': ('形容詞', '白い'),
            '黒い': ('形容詞', '黒い'),
            '赤い': ('形容詞', '赤い'),
            '青い': ('形容詞', '青い'),
            '長い': ('形容詞', '長い'),
            '短い': ('形容詞', '短い'),
            '多い': ('形容詞', '多い'),
            '少ない': ('形容詞', '少ない'),
            '強い': ('形容詞', '強い'),
            '弱い': ('形容詞', '弱い'),
            '広い': ('形容詞', '広い'),
            '狭い': ('形容詞', '狭い'),
            '近い': ('形容詞', '近い'),
            '遠い': ('形容詞', '遠い'),
            # Stems (used in conjugated forms)
            '大き': ('形容詞', '大きい'),
            '小さ': ('形容詞', '小さい'),
            '高': ('形容詞', '高い'),
            '安': ('形容詞', '安い'),
            '新し': ('形容詞', '新しい'),
            '古': ('形容詞', '古い'),
            '良': ('形容詞', '良い'),
            '悪': ('形容詞', '悪い'),
            '美し': ('形容詞', '美しい'),
            '嬉し': ('形容詞', '嬉しい'),
        }

        # =================================================================
        # 動詞活用形 (verb conjugation forms)
        # =================================================================
        self.verb_forms = {
            # 行く
            '行きます': [('行', '動詞'), ('き', '連用形'), ('ます', '助動詞')],
            '行った': [('行', '動詞'), ('っ', '促音便'), ('た', '助動詞')],
            '行く': [('行', '動詞'), ('く', '終止形')],
            '行かない': [('行', '動詞'), ('か', '未然形'), ('ない', '助動詞')],
            # 来る
            '来ます': [('来', '動詞'), ('ます', '助動詞')],
            '来た': [('来', '動詞'), ('た', '助動詞')],
            '来ない': [('来', '動詞'), ('ない', '助動詞')],
            # する
            'します': [('し', '動詞'), ('ます', '助動詞')],
            'した': [('し', '動詞'), ('た', '助動詞')],
            'しない': [('し', '動詞'), ('ない', '助動詞')],
            # 食べる
            '食べます': [('食', '動詞'), ('べ', '連用形'), ('ます', '助動詞')],
            '食べた': [('食', '動詞'), ('べ', '連用形'), ('た', '助動詞')],
            '食べない': [('食', '動詞'), ('べ', '未然形'), ('ない', '助動詞')],
            # 見る
            '見ます': [('見', '動詞'), ('ます', '助動詞')],
            '見た': [('見', '動詞'), ('た', '助動詞')],
            '見ない': [('見', '動詞'), ('ない', '助動詞')],
            # 買う
            '買います': [('買', '動詞'), ('い', '連用形'), ('ます', '助動詞')],
            '買った': [('買', '動詞'), ('っ', '促音便'), ('た', '助動詞')],
            '買う': [('買', '動詞'), ('う', '終止形')],
        }

        # =================================================================
        # 副詞 (adverbs)
        # =================================================================
        self.adverbs = {
            'とても': '副詞', 'すごく': '副詞', 'かなり': '副詞',
            'よく': '副詞', 'まだ': '副詞', 'もう': '副詞',
            'たくさん': '副詞', 'ちょっと': '副詞', '少し': '副詞',
        }

        # =================================================================
        # Load the extended dictionary (optional external asset)
        # =================================================================
        self._load_extended_dictionary()
    def _load_extended_dictionary(self):
        """Load the optional external extended dictionary."""
        dict_path = DICT_DIR / 'ja_extended.json'
        if not dict_path.exists():
            return

        with open(dict_path, 'r', encoding='utf-8') as f:
            extended = json.load(f)

        # Merge into the built-in dictionaries
        for word, upos in extended.items():
            if upos in ('NOUN', 'PROPN'):
                if word not in self.nouns:
                    self.nouns[word] = ('名詞', word)
            elif upos == 'VERB':
                if word not in self.verbs:
                    self.verbs[word] = ('動詞', word, 'godan')
            elif upos == 'ADJ':
                if word not in self.adjectives:
                    self.adjectives[word] = ('形容詞', word)
            elif upos == 'ADV':
                if word not in self.adverbs:
                    self.adverbs[word] = '副詞'
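    # Illustrative sketch (not part of the shipped package data): the loader
    # above expects TOKMOR_DATA_DIR/extended_dict/ja_extended.json to map a
    # surface form to a UPOS tag, e.g.
    #     {"渋谷": "PROPN", "頑張る": "VERB", "ゆっくり": "ADV"}
    # Words already covered by the built-in dictionaries are left untouched,
    # and external verbs default to the generic 'godan' conjugation type.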
    def _build_domain_dictionaries(self):
        """Build the per-domain dictionaries."""

        # FOOD domain
        self._domain_dictionaries[Domain.FOOD] = {
            '橋': ('箸', '名詞'),  # はし -> chopsticks
            '株': ('株', '名詞'),  # かぶ -> turnip
            '飯': ('飯', '名詞'),
            '酒': ('酒', '名詞'),
        }

        # TECH domain
        self._domain_dictionaries[Domain.TECH] = {
            '橋': ('ブリッジ', '名詞'),  # network bridge
            '株': ('株', '名詞'),
        }

        # FINANCE domain
        self._domain_dictionaries[Domain.FINANCE] = {
            '橋': ('橋', '名詞'),  # ordinary bridge
            '株': ('株式', '名詞'),  # stock / share
            '銀行': ('銀行', '名詞'),
            '投資': ('投資', '名詞'),
        }

        # ENTERTAINMENT domain
        self._domain_dictionaries[Domain.ENTERTAINMENT] = {
            'AKB': ('AKB48', '名詞'),
            '嵐': ('嵐', '名詞'),  # idol group
        }
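    # Worked example of the tables above: analyze("株を買う", domain="finance")
    # resolves 株 via the FINANCE entry to lemma 株式, then を as 助詞, and
    # 買う via verb_forms as 買/動詞 + う/終止形; under Domain.FOOD the same
    # surface 株 keeps lemma 株 (turnip) and 橋 is read as 箸 (chopsticks).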
    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Generate analysis candidates."""
        if not text or not text.strip():
            return [AnalysisResult([])]

        candidates = []

        # Primary analysis
        main_morphemes = self._analyze_text(text, domain)
        main_result = AnalysisResult(
            morphemes=main_morphemes,
            score=1.0,
            domain=domain
        )
        main_result.score = self._score_analysis(main_result)
        candidates.append(main_result)

        return candidates
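    # Note: only the single main analysis is produced here; for n_best > 1 the
    # advanced_base machinery presumably merges this candidate with the output
    # of _generate_alternatives() below and ranks the pool by score.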
    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Analyze text into morphemes."""
        if not text:
            return []

        result = []
        pos = 0

        while pos < len(text):
            matched = False

            # Skip whitespace
            if text[pos].isspace():
                pos += 1
                continue

            # Runtime user dictionary first (longest match)
            for length in range(min(len(text) - pos, 20), 0, -1):
                substring = text[pos:pos+length]

                if substring in self._user_dictionary:
                    lemma, pos_tag, _ = self._user_dictionary[substring]
                    result.append(Morpheme(
                        surface=substring, lemma=lemma, pos=pos_tag,
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

            if matched:
                continue

            # Domain dictionary
            for length in range(min(len(text) - pos, 10), 0, -1):
                substring = text[pos:pos+length]
                domain_sense = self._get_domain_sense(substring, domain)

                if domain_sense:
                    result.append(Morpheme(
                        surface=substring, lemma=domain_sense[0], pos=domain_sense[1],
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

            if matched:
                continue

            # Conjugated-form dictionary
            for length in range(min(len(text) - pos, 10), 0, -1):
                substring = text[pos:pos+length]

                if substring in self.verb_forms:
                    curr_pos = pos
                    for surface, tag in self.verb_forms[substring]:
                        result.append(Morpheme(
                            surface=surface, lemma=surface, pos=tag,
                            start=curr_pos, end=curr_pos + len(surface)
                        ))
                        curr_pos += len(surface)
                    pos += length
                    matched = True
                    break

            if matched:
                continue

            # Longest-match lexicon lookup
            for length in range(min(len(text) - pos, 10), 0, -1):
                substring = text[pos:pos+length]

                # Nouns
                if substring in self.nouns:
                    info = self.nouns[substring]
                    result.append(Morpheme(
                        surface=substring, lemma=substring, pos=info[0],
                        start=pos, end=pos + length,
                        features={'reading': info[1]}
                    ))
                    pos += length
                    matched = True
                    break

                # Particles
                if substring in self.particles:
                    result.append(Morpheme(
                        surface=substring, lemma=substring, pos='助詞',
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

                # Auxiliaries
                if substring in self.auxiliaries:
                    result.append(Morpheme(
                        surface=substring, lemma=substring, pos='助動詞',
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

                # Adverbs
                if substring in self.adverbs:
                    result.append(Morpheme(
                        surface=substring, lemma=substring, pos='副詞',
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

                # Adjectives
                if substring in self.adjectives:
                    info = self.adjectives[substring]
                    result.append(Morpheme(
                        surface=substring, lemma=info[1], pos=info[0],
                        start=pos, end=pos + length
                    ))
                    pos += length
                    matched = True
                    break

            if not matched:
                # Script-based chunking
                char = text[pos]

                # Kanji
                if self.KANJI_PATTERN.match(char):
                    match = self.KANJI_PATTERN.match(text[pos:])
                    chunk = match.group()
                    result.append(Morpheme(
                        surface=chunk, lemma=chunk, pos='名詞',
                        start=pos, end=pos + len(chunk)
                    ))
                    pos += len(chunk)

                # Hiragana
                elif self.HIRAGANA_PATTERN.match(char):
                    match = self.HIRAGANA_PATTERN.match(text[pos:])
                    chunk = match.group()
                    # Try to split off particles/auxiliaries
                    analyzed = self._analyze_hiragana_chunk(chunk, pos)
                    result.extend(analyzed)
                    pos += len(chunk)

                # Katakana (loanwords)
                elif self.KATAKANA_PATTERN.match(char):
                    match = self.KATAKANA_PATTERN.match(text[pos:])
                    chunk = match.group()
                    result.append(Morpheme(
                        surface=chunk, lemma=chunk, pos='名詞',
                        start=pos, end=pos + len(chunk),
                        features={'type': 'katakana'}
                    ))
                    pos += len(chunk)

                # Latin letters (English)
                elif self.LATIN_PATTERN.match(char):
                    match = self.LATIN_PATTERN.match(text[pos:])
                    chunk = match.group()
                    result.append(Morpheme(
                        surface=chunk, lemma=chunk, pos='名詞',
                        start=pos, end=pos + len(chunk),
                        features={'type': 'latin'}
                    ))
                    pos += len(chunk)

                # Digits
                elif self.NUMBER_PATTERN.match(char):
                    match = self.NUMBER_PATTERN.match(text[pos:])
                    chunk = match.group()
                    result.append(Morpheme(
                        surface=chunk, lemma=chunk, pos='数詞',
                        start=pos, end=pos + len(chunk)
                    ))
                    pos += len(chunk)

                # Anything else (symbols etc.)
                else:
                    result.append(Morpheme(
                        surface=char, lemma=char, pos='記号',
                        start=pos, end=pos + 1
                    ))
                    pos += 1

        return result
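    # Worked trace of the cascade above on "東京に行きます": the user and
    # domain dictionaries miss; 東京 hits the noun table (名詞, reading
    # トウキョウ); に hits the particle table (助詞); and 行きます hits
    # verb_forms, expanding to 行/動詞 + き/連用形 + ます/助動詞.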
    def _analyze_hiragana_chunk(self, chunk: str, offset: int) -> List[Morpheme]:
        """Analyze a hiragana chunk."""
        results = []

        # Longest match over particles and auxiliaries
        pos = 0
        while pos < len(chunk):
            matched = False

            for length in range(min(len(chunk) - pos, 5), 0, -1):
                substring = chunk[pos:pos+length]

                if substring in self.particles:
                    results.append(Morpheme(
                        surface=substring, lemma=substring, pos='助詞',
                        start=offset + pos, end=offset + pos + length
                    ))
                    pos += length
                    matched = True
                    break

                if substring in self.auxiliaries:
                    results.append(Morpheme(
                        surface=substring, lemma=substring, pos='助動詞',
                        start=offset + pos, end=offset + pos + length
                    ))
                    pos += length
                    matched = True
                    break

            if not matched:
                # Treat the remainder as a noun
                remaining = chunk[pos:]
                if remaining:
                    results.append(Morpheme(
                        surface=remaining, lemma=remaining, pos='名詞',
                        start=offset + pos, end=offset + len(chunk)
                    ))
                break

        return results
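    # Example: _analyze_hiragana_chunk("ですね", offset) peels off です as
    # 助動詞 and ね as 助詞 by longest match, while a chunk with no particle
    # or auxiliary prefix, e.g. "みどり", falls through and is kept whole
    # as a single 名詞.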
    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Generate alternative analyses (for N-best)."""
        alternatives = []

        # Re-analyze under other domains
        other_domains = [d for d in Domain if d != domain][:count]

        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(
                morphemes=morphemes,
                score=0.8,
                domain=alt_domain
            )
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)

        return alternatives

# Alias for backward compatibility
JapaneseAnalyzer = JapaneseAdvancedAnalyzer
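A minimal usage sketch, mirroring the calls shown in the class docstring (the keyword arguments come from that docstring; return types are defined in tokmor.morphology.advanced_base and are not shown in this diff):

    from tokmor.morphology.japanese_advanced import JapaneseAdvancedAnalyzer

    analyzer = JapaneseAdvancedAnalyzer()

    # Basic analysis
    result = analyzer.analyze("東京に行きます")

    # Entity preservation and domain adaptation
    analyzer.add_entity("東京大学", "ORG")
    result = analyzer.analyze("東京大学に行きます", preserve_entities=True)
    result = analyzer.analyze("株を買う", domain="finance")

    # N-best candidates with confidence scores
    result = analyzer.analyze("橋", n_best=3)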