tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,589 @@
1
+ """
2
+ Japanese Advanced Morphological Analyzer
3
+ ========================================
4
+
5
+ 5가지 고급 기능을 지원하는 일본어 형태소 분석기
6
+
7
+ Features:
8
+ 1. NER Gazetteer Integration - 개체명 경계 보존
9
+ 2. Real-time Dictionary Extension - 런타임 사전 확장
10
+ 3. Domain Adaptation - 도메인별 분석 최적화
11
+ 4. Code-switching - 영일 혼용 텍스트 처리
12
+ 5. N-best Analysis - 다중 후보 + 신뢰도 점수
13
+ """
14
+
15
+ import re
16
+ import json
17
+ from pathlib import Path
18
+ from typing import List, Tuple, Dict, Set, Optional, Any
19
+
20
+ from .advanced_base import (
21
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
22
+ )
23
+
24
+ # 확장 사전 경로
25
+ from .. import resources
26
+
27
+ # Optional external asset dir (default: none). If you want extended dictionaries,
28
+ # provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
29
+ DICT_DIR = resources.data_dir() / "extended_dict"
30
+
31
+
32
+ class JapaneseAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
33
+ """
34
+ 일본어 고급 형태소 분석기
35
+
36
+ Usage:
37
+ analyzer = JapaneseAdvancedAnalyzer()
38
+
39
+ # 기본 분석
40
+ result = analyzer.analyze("東京に行きます")
41
+
42
+ # 개체명 보존
43
+ analyzer.add_entity("東京大学", "ORG")
44
+ result = analyzer.analyze("東京大学に行きます", preserve_entities=True)
45
+
46
+ # 도메인 적응
47
+ result = analyzer.analyze("株を買う", domain="finance")
48
+
49
+ # N-best 분석
50
+ result = analyzer.analyze("橋", n_best=3)
51
+ """
52
+
53
+ LANG_CODE = "ja"
54
+ LANG_NAME = "Japanese"
55
+
56
+ # Unicode patterns
57
+ HIRAGANA_PATTERN = re.compile(r'[\u3040-\u309f]+')
58
+ KATAKANA_PATTERN = re.compile(r'[\u30a0-\u30ff]+')
59
+ KANJI_PATTERN = re.compile(r'[\u4e00-\u9fff]+')
60
+ LATIN_PATTERN = re.compile(r'[a-zA-Z]+')
61
+ NUMBER_PATTERN = re.compile(r'[0-9]+')
62
+
63
+ def __init__(self):
64
+ super().__init__()
65
+
66
+ def _build_base_dictionary(self):
67
+ """기본 사전 구축"""
68
+
69
+ # =================================================================
70
+ # 助詞 (조사)
71
+ # =================================================================
72
+ self.particles = {
73
+ # 格助詞
74
+ 'は': 'HA', 'が': 'GA', 'を': 'WO', 'に': 'NI', 'へ': 'HE',
75
+ 'で': 'DE', 'と': 'TO', 'から': 'KARA', 'まで': 'MADE',
76
+ 'より': 'YORI', 'の': 'NO',
77
+ # 接続助詞
78
+ 'ば': 'BA', 'たら': 'TARA', 'ても': 'TEMO', 'けど': 'KEDO',
79
+ 'けれど': 'KEREDO', 'が': 'GA', 'のに': 'NONI',
80
+ # 副助詞
81
+ 'も': 'MO', 'だけ': 'DAKE', 'しか': 'SHIKA', 'ばかり': 'BAKARI',
82
+ 'など': 'NADO', 'くらい': 'KURAI', 'ほど': 'HODO',
83
+ # 終助詞
84
+ 'か': 'KA', 'ね': 'NE', 'よ': 'YO', 'な': 'NA', 'わ': 'WA',
85
+ 'ぞ': 'ZO', 'さ': 'SA',
86
+ }
87
+
88
+ # =================================================================
89
+ # 助動詞 (조동사)
90
+ # =================================================================
91
+ self.auxiliaries = {
92
+ 'です': 'AUX', 'ます': 'AUX', 'た': 'AUX', 'だ': 'AUX',
93
+ 'ない': 'AUX', 'れる': 'AUX', 'られる': 'AUX',
94
+ 'せる': 'AUX', 'させる': 'AUX', 'たい': 'AUX',
95
+ 'ている': 'AUX', 'てる': 'AUX', 'ました': 'AUX',
96
+ 'でした': 'AUX', 'ません': 'AUX', 'ではない': 'AUX',
97
+ }
98
+
99
+ # =================================================================
100
+ # 名詞 (명사)
101
+ # =================================================================
102
+ self.nouns = {
103
+ # 地名
104
+ '東京': ('名詞', 'トウキョウ'),
105
+ '大阪': ('名詞', 'オオサカ'),
106
+ '京都': ('名詞', 'キョウト'),
107
+ '日本': ('名詞', 'ニホン'),
108
+ '中国': ('名詞', 'チュウゴク'),
109
+ '韓国': ('名詞', 'カンコク'),
110
+ 'アメリカ': ('名詞', 'アメリカ'),
111
+ # 組織
112
+ '会社': ('名詞', 'カイシャ'),
113
+ '学校': ('名詞', 'ガッコウ'),
114
+ '大学': ('名詞', 'ダイガク'),
115
+ '政府': ('名詞', 'セイフ'),
116
+ '銀行': ('名詞', 'ギンコウ'),
117
+ # 一般名詞
118
+ '人': ('名詞', 'ヒト'),
119
+ '仕事': ('名詞', 'シゴト'),
120
+ '時間': ('名詞', 'ジカン'),
121
+ '今日': ('名詞', 'キョウ'),
122
+ '明日': ('名詞', 'アシタ'),
123
+ '昨日': ('名詞', 'キノウ'),
124
+ '発表': ('名詞', 'ハッピョウ'),
125
+ '自動車': ('名詞', 'ジドウシャ'),
126
+ '電話': ('名詞', 'デンワ'),
127
+ '食べ物': ('名詞', 'タベモノ'),
128
+ '飲み物': ('名詞', 'ノミモノ'),
129
+ '橋': ('名詞', 'ハシ'), # 다의어: 橋/箸/端
130
+ '株': ('名詞', 'カブ'), # 다의어: 株式/木の株
131
+ # 代名詞
132
+ '私': ('代名詞', 'ワタシ'),
133
+ '僕': ('代名詞', 'ボク'),
134
+ '彼': ('代名詞', 'カレ'),
135
+ '彼女': ('代名詞', 'カノジョ'),
136
+ 'これ': ('代名詞', 'コレ'),
137
+ 'それ': ('代名詞', 'ソレ'),
138
+ 'あれ': ('代名詞', 'アレ'),
139
+ }
140
+
141
+ # =================================================================
142
+ # 動詞 (동사) - 어간 + 활용 타입
143
+ # =================================================================
144
+ self.verbs = {
145
+ # 5段動詞
146
+ '行': ('動詞', '行く', 'godan_k'),
147
+ '書': ('動詞', '書く', 'godan_k'),
148
+ '聞': ('動詞', '聞く', 'godan_k'),
149
+ '読': ('動詞', '読む', 'godan_m'),
150
+ '飲': ('動詞', '飲む', 'godan_m'),
151
+ '話': ('動詞', '話す', 'godan_s'),
152
+ '待': ('動詞', '待つ', 'godan_t'),
153
+ '買': ('動詞', '買う', 'godan_w'),
154
+ '言': ('動詞', '言う', 'godan_w'),
155
+ '思': ('動詞', '思う', 'godan_w'),
156
+ # 1段動詞
157
+ '見': ('動詞', '見る', 'ichidan'),
158
+ '食': ('動詞', '食べる', 'ichidan'),
159
+ '起': ('動詞', '起きる', 'ichidan'),
160
+ '寝': ('動詞', '寝る', 'ichidan'),
161
+ # カ変・サ変
162
+ '来': ('動詞', '来る', 'kuru'),
163
+ 'し': ('動詞', 'する', 'suru'),
164
+ 'する': ('動詞', 'する', 'suru'),
165
+ }
166
+
167
+ # =================================================================
168
+ # 形容詞 (형용사) - 기본형 + 어간형
169
+ # =================================================================
170
+ self.adjectives = {
171
+ # 기본형 (終止形)
172
+ '大きい': ('形容詞', '大きい'),
173
+ '小さい': ('形容詞', '小さい'),
174
+ '高い': ('形容詞', '高い'),
175
+ '安い': ('形容詞', '安い'),
176
+ '新しい': ('形容詞', '新しい'),
177
+ '古い': ('形容詞', '古い'),
178
+ '良い': ('形容詞', '良い'),
179
+ 'いい': ('形容詞', '良い'), # 口語形
180
+ '悪い': ('形容詞', '悪い'),
181
+ '美しい': ('形容詞', '美しい'),
182
+ '嬉しい': ('形容詞', '嬉しい'),
183
+ '楽しい': ('形容詞', '楽しい'),
184
+ '難しい': ('形容詞', '難しい'),
185
+ '早い': ('形容詞', '早い'),
186
+ '速い': ('形容詞', '速い'),
187
+ '遅い': ('形容詞', '遅い'),
188
+ '若い': ('形容詞', '若い'),
189
+ '白い': ('形容詞', '白い'),
190
+ '黒い': ('形容詞', '黒い'),
191
+ '赤い': ('形容詞', '赤い'),
192
+ '青い': ('形容詞', '青い'),
193
+ '長い': ('形容詞', '長い'),
194
+ '短い': ('形容詞', '短い'),
195
+ '多い': ('形容詞', '多い'),
196
+ '少ない': ('形容詞', '少ない'),
197
+ '強い': ('形容詞', '強い'),
198
+ '弱い': ('形容詞', '弱い'),
199
+ '広い': ('形容詞', '広い'),
200
+ '狭い': ('形容詞', '狭い'),
201
+ '近い': ('形容詞', '近い'),
202
+ '遠い': ('形容詞', '遠い'),
203
+ # 어간 (活用形에서 사용)
204
+ '大き': ('形容詞', '大きい'),
205
+ '小さ': ('形容詞', '小さい'),
206
+ '高': ('形容詞', '高い'),
207
+ '安': ('形容詞', '安い'),
208
+ '新し': ('形容詞', '新しい'),
209
+ '古': ('形容詞', '古い'),
210
+ '良': ('形容詞', '良い'),
211
+ '悪': ('形容詞', '悪い'),
212
+ '美し': ('形容詞', '美しい'),
213
+ '嬉し': ('形容詞', '嬉しい'),
214
+ }
215
+
216
+ # =================================================================
217
+ # 動詞活用形 (동사 활용형)
218
+ # =================================================================
219
+ self.verb_forms = {
220
+ # 行く
221
+ '行きます': [('行', '動詞'), ('き', '連用形'), ('ます', '助動詞')],
222
+ '行った': [('行', '動詞'), ('っ', '促音便'), ('た', '助動詞')],
223
+ '行く': [('行', '動詞'), ('く', '終止形')],
224
+ '行かない': [('行', '動詞'), ('か', '未然形'), ('ない', '助動詞')],
225
+ # 来る
226
+ '来ます': [('来', '動詞'), ('ます', '助動詞')],
227
+ '来た': [('来', '動詞'), ('た', '助動詞')],
228
+ '来ない': [('来', '動詞'), ('ない', '助動詞')],
229
+ # する
230
+ 'します': [('し', '動詞'), ('ます', '助動詞')],
231
+ 'した': [('し', '動詞'), ('た', '助動詞')],
232
+ 'しない': [('し', '動詞'), ('ない', '助動詞')],
233
+ # 食べる
234
+ '食べます': [('食', '動詞'), ('べ', '連用形'), ('ます', '助動詞')],
235
+ '食べた': [('食', '動詞'), ('べ', '連用形'), ('た', '助動詞')],
236
+ '食べない': [('食', '動詞'), ('べ', '未然形'), ('ない', '助動詞')],
237
+ # 見る
238
+ '見ます': [('見', '動詞'), ('ます', '助動詞')],
239
+ '見た': [('見', '動詞'), ('た', '助動詞')],
240
+ '見ない': [('見', '動詞'), ('ない', '助動詞')],
241
+ # 買う
242
+ '買います': [('買', '動詞'), ('い', '連用形'), ('ます', '助動詞')],
243
+ '買った': [('買', '動詞'), ('っ', '促音便'), ('た', '助動詞')],
244
+ '買う': [('買', '動詞'), ('う', '終止形')],
245
+ }
246
+
247
+ # =================================================================
248
+ # 副詞 (부사)
249
+ # =================================================================
250
+ self.adverbs = {
251
+ 'とても': '副詞', 'すごく': '副詞', 'かなり': '副詞',
252
+ 'よく': '副詞', 'まだ': '副詞', 'もう': '副詞',
253
+ 'たくさん': '副詞', 'ちょっと': '副詞', '少し': '副詞',
254
+ }
255
+
256
+ # =================================================================
257
+ # 확장 사전 로드 (optional external asset)
258
+ # =================================================================
259
+ self._load_extended_dictionary()
260
+
261
+ def _load_extended_dictionary(self):
262
+ """Load optional external extended dictionary"""
263
+ dict_path = DICT_DIR / 'ja_extended.json'
264
+ if not dict_path.exists():
265
+ return
266
+
267
+ with open(dict_path, 'r', encoding='utf-8') as f:
268
+ extended = json.load(f)
269
+
270
+ # 기존 사전에 추가
271
+ for word, upos in extended.items():
272
+ if upos in ('NOUN', 'PROPN'):
273
+ if word not in self.nouns:
274
+ self.nouns[word] = ('名詞', word)
275
+ elif upos == 'VERB':
276
+ if word not in self.verbs:
277
+ self.verbs[word] = ('動詞', word, 'godan')
278
+ elif upos == 'ADJ':
279
+ if word not in self.adjectives:
280
+ self.adjectives[word] = ('形容詞', word)
281
+ elif upos == 'ADV':
282
+ if word not in self.adverbs:
283
+ self.adverbs[word] = '副詞'
284
+
285
+ def _build_domain_dictionaries(self):
286
+ """도메인별 사전 구축"""
287
+
288
+ # FOOD 도메인
289
+ self._domain_dictionaries[Domain.FOOD] = {
290
+ '橋': ('箸', '名詞'), # はし -> 젓가락
291
+ '株': ('株', '名詞'), # かぶ -> 순무
292
+ '飯': ('飯', '名詞'),
293
+ '酒': ('酒', '名詞'),
294
+ }
295
+
296
+ # TECH 도메인
297
+ self._domain_dictionaries[Domain.TECH] = {
298
+ '橋': ('ブリッジ', '名詞'), # 네트워크 브리지
299
+ '株': ('株', '名詞'),
300
+ }
301
+
302
+ # FINANCE 도메인
303
+ self._domain_dictionaries[Domain.FINANCE] = {
304
+ '橋': ('橋', '名詞'), # 일반적 다리
305
+ '株': ('株式', '名詞'), # 주식
306
+ '銀行': ('銀行', '名詞'),
307
+ '投資': ('投資', '名詞'),
308
+ }
309
+
310
+ # ENTERTAINMENT 도메인
311
+ self._domain_dictionaries[Domain.ENTERTAINMENT] = {
312
+ 'AKB': ('AKB48', '名詞'),
313
+ '嵐': ('嵐', '名詞'), # 아이돌 그룹
314
+ }
315
+
316
+ def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
317
+ """분석 후보 생성"""
318
+ if not text or not text.strip():
319
+ return [AnalysisResult([])]
320
+
321
+ candidates = []
322
+
323
+ # 기본 분석
324
+ main_morphemes = self._analyze_text(text, domain)
325
+ main_result = AnalysisResult(
326
+ morphemes=main_morphemes,
327
+ score=1.0,
328
+ domain=domain
329
+ )
330
+ main_result.score = self._score_analysis(main_result)
331
+ candidates.append(main_result)
332
+
333
+ return candidates
334
+
335
+ def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
336
+ """텍스트 분석"""
337
+ if not text:
338
+ return []
339
+
340
+ result = []
341
+ pos = 0
342
+
343
+ while pos < len(text):
344
+ matched = False
345
+
346
+ # 공백 스킵
347
+ if text[pos].isspace():
348
+ pos += 1
349
+ continue
350
+
351
+ # 런타임 사전 우선 확인 (최장일치)
352
+ for length in range(min(len(text) - pos, 20), 0, -1):
353
+ substring = text[pos:pos+length]
354
+
355
+ if substring in self._user_dictionary:
356
+ lemma, pos_tag, _ = self._user_dictionary[substring]
357
+ result.append(Morpheme(
358
+ surface=substring, lemma=lemma, pos=pos_tag,
359
+ start=pos, end=pos + length
360
+ ))
361
+ pos += length
362
+ matched = True
363
+ break
364
+
365
+ if matched:
366
+ continue
367
+
368
+ # 도메인 사전 확인
369
+ for length in range(min(len(text) - pos, 10), 0, -1):
370
+ substring = text[pos:pos+length]
371
+ domain_sense = self._get_domain_sense(substring, domain)
372
+
373
+ if domain_sense:
374
+ result.append(Morpheme(
375
+ surface=substring, lemma=domain_sense[0], pos=domain_sense[1],
376
+ start=pos, end=pos + length
377
+ ))
378
+ pos += length
379
+ matched = True
380
+ break
381
+
382
+ if matched:
383
+ continue
384
+
385
+ # 활용형 사전 확인
386
+ for length in range(min(len(text) - pos, 10), 0, -1):
387
+ substring = text[pos:pos+length]
388
+
389
+ if substring in self.verb_forms:
390
+ curr_pos = pos
391
+ for surface, tag in self.verb_forms[substring]:
392
+ result.append(Morpheme(
393
+ surface=surface, lemma=surface, pos=tag,
394
+ start=curr_pos, end=curr_pos + len(surface)
395
+ ))
396
+ curr_pos += len(surface)
397
+ pos += length
398
+ matched = True
399
+ break
400
+
401
+ if matched:
402
+ continue
403
+
404
+ # 최장일치 사전 탐색
405
+ for length in range(min(len(text) - pos, 10), 0, -1):
406
+ substring = text[pos:pos+length]
407
+
408
+ # 명사
409
+ if substring in self.nouns:
410
+ info = self.nouns[substring]
411
+ result.append(Morpheme(
412
+ surface=substring, lemma=substring, pos=info[0],
413
+ start=pos, end=pos + length,
414
+ features={'reading': info[1]}
415
+ ))
416
+ pos += length
417
+ matched = True
418
+ break
419
+
420
+ # 조사
421
+ if substring in self.particles:
422
+ result.append(Morpheme(
423
+ surface=substring, lemma=substring, pos='助詞',
424
+ start=pos, end=pos + length
425
+ ))
426
+ pos += length
427
+ matched = True
428
+ break
429
+
430
+ # 조동사
431
+ if substring in self.auxiliaries:
432
+ result.append(Morpheme(
433
+ surface=substring, lemma=substring, pos='助動詞',
434
+ start=pos, end=pos + length
435
+ ))
436
+ pos += length
437
+ matched = True
438
+ break
439
+
440
+ # 부사
441
+ if substring in self.adverbs:
442
+ result.append(Morpheme(
443
+ surface=substring, lemma=substring, pos='副詞',
444
+ start=pos, end=pos + length
445
+ ))
446
+ pos += length
447
+ matched = True
448
+ break
449
+
450
+ # 형용사
451
+ if substring in self.adjectives:
452
+ info = self.adjectives[substring]
453
+ result.append(Morpheme(
454
+ surface=substring, lemma=info[1], pos=info[0],
455
+ start=pos, end=pos + length
456
+ ))
457
+ pos += length
458
+ matched = True
459
+ break
460
+
461
+ if not matched:
462
+ # 스크립트별 청크 처리
463
+ char = text[pos]
464
+
465
+ # 한자
466
+ if self.KANJI_PATTERN.match(char):
467
+ match = self.KANJI_PATTERN.match(text[pos:])
468
+ chunk = match.group()
469
+ result.append(Morpheme(
470
+ surface=chunk, lemma=chunk, pos='名詞',
471
+ start=pos, end=pos + len(chunk)
472
+ ))
473
+ pos += len(chunk)
474
+
475
+ # 히라가나
476
+ elif self.HIRAGANA_PATTERN.match(char):
477
+ match = self.HIRAGANA_PATTERN.match(text[pos:])
478
+ chunk = match.group()
479
+ # 조사/조동사 분리 시도
480
+ analyzed = self._analyze_hiragana_chunk(chunk, pos)
481
+ result.extend(analyzed)
482
+ pos += len(chunk)
483
+
484
+ # 가타카나 (외래어)
485
+ elif self.KATAKANA_PATTERN.match(char):
486
+ match = self.KATAKANA_PATTERN.match(text[pos:])
487
+ chunk = match.group()
488
+ result.append(Morpheme(
489
+ surface=chunk, lemma=chunk, pos='名詞',
490
+ start=pos, end=pos + len(chunk),
491
+ features={'type': 'katakana'}
492
+ ))
493
+ pos += len(chunk)
494
+
495
+ # 라틴 문자 (영어)
496
+ elif self.LATIN_PATTERN.match(char):
497
+ match = self.LATIN_PATTERN.match(text[pos:])
498
+ chunk = match.group()
499
+ result.append(Morpheme(
500
+ surface=chunk, lemma=chunk, pos='名詞',
501
+ start=pos, end=pos + len(chunk),
502
+ features={'type': 'latin'}
503
+ ))
504
+ pos += len(chunk)
505
+
506
+ # 숫자
507
+ elif self.NUMBER_PATTERN.match(char):
508
+ match = self.NUMBER_PATTERN.match(text[pos:])
509
+ chunk = match.group()
510
+ result.append(Morpheme(
511
+ surface=chunk, lemma=chunk, pos='数詞',
512
+ start=pos, end=pos + len(chunk)
513
+ ))
514
+ pos += len(chunk)
515
+
516
+ # 기타 (기호 등)
517
+ else:
518
+ result.append(Morpheme(
519
+ surface=char, lemma=char, pos='記号',
520
+ start=pos, end=pos + 1
521
+ ))
522
+ pos += 1
523
+
524
+ return result
525
+
526
+ def _analyze_hiragana_chunk(self, chunk: str, offset: int) -> List[Morpheme]:
527
+ """히라가나 청크 분석"""
528
+ results = []
529
+
530
+ # 조사/조동사 최장일치
531
+ pos = 0
532
+ while pos < len(chunk):
533
+ matched = False
534
+
535
+ for length in range(min(len(chunk) - pos, 5), 0, -1):
536
+ substring = chunk[pos:pos+length]
537
+
538
+ if substring in self.particles:
539
+ results.append(Morpheme(
540
+ surface=substring, lemma=substring, pos='助詞',
541
+ start=offset + pos, end=offset + pos + length
542
+ ))
543
+ pos += length
544
+ matched = True
545
+ break
546
+
547
+ if substring in self.auxiliaries:
548
+ results.append(Morpheme(
549
+ surface=substring, lemma=substring, pos='助動詞',
550
+ start=offset + pos, end=offset + pos + length
551
+ ))
552
+ pos += length
553
+ matched = True
554
+ break
555
+
556
+ if not matched:
557
+ # 남은 부분은 명사로
558
+ remaining = chunk[pos:]
559
+ if remaining:
560
+ results.append(Morpheme(
561
+ surface=remaining, lemma=remaining, pos='名詞',
562
+ start=offset + pos, end=offset + len(chunk)
563
+ ))
564
+ break
565
+
566
+ return results
567
+
568
+ def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
569
+ """대안 분석 결과 생성 (N-best용)"""
570
+ alternatives = []
571
+
572
+ # 다른 도메인으로 분석
573
+ other_domains = [d for d in Domain if d != domain][:count]
574
+
575
+ for alt_domain in other_domains:
576
+ morphemes = self._analyze_text(text, alt_domain)
577
+ result = AnalysisResult(
578
+ morphemes=morphemes,
579
+ score=0.8,
580
+ domain=alt_domain
581
+ )
582
+ result.score = self._score_analysis(result) * 0.9
583
+ alternatives.append(result)
584
+
585
+ return alternatives
586
+
587
+
588
+ # Alias for backward compatibility
589
+ JapaneseAnalyzer = JapaneseAdvancedAnalyzer