tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,258 @@
+ """
+ Hindi Advanced Morphological Analyzer
+ =====================================
+ 
+ A Hindi morphological analyzer supporting five advanced features.
+ 
+ Features:
+ - Devanagari script handling
+ - Postposition splitting
+ - Verb conjugation analysis
+ """
+ 
+ import re
+ from typing import List, Tuple, Dict, Optional
+ 
+ from .advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+ )
+ 
+ 
+ class HindiAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+     """Advanced Hindi morphological analyzer."""
+ 
+     LANG_CODE = "hi"
+     LANG_NAME = "Hindi"
+ 
+     # Devanagari character patterns
+     DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u097F]+')
+     NUMBER_PATTERN = re.compile(r'[0-9०-९]+(?:[.,][0-9०-९]+)?')
+ 
+     def __init__(self):
+         super().__init__()
+ 
+     def _build_base_dictionary(self):
+         """Build the base dictionary."""
+ 
+         # Postpositions
+         self.postpositions = {
+             'का': 'PSP', 'की': 'PSP', 'के': 'PSP',  # genitive (of)
+             'को': 'PSP',     # to (dative)
+             'से': 'PSP',     # from/by
+             'में': 'PSP',    # in
+             'पर': 'PSP',     # on
+             'तक': 'PSP',     # until
+             'लिए': 'PSP',    # for
+             'साथ': 'PSP',    # with
+             'बिना': 'PSP',   # without
+             'द्वारा': 'PSP',  # by (agent)
+             'बारे': 'PSP',    # about
+             'ने': 'PSP',     # ergative (agent) marker
+         }
+ 
+         # Pronouns
+         self.pronouns = {
+             'मैं': 'PRON', 'तू': 'PRON', 'तुम': 'PRON', 'आप': 'PRON',
+             'वह': 'PRON', 'यह': 'PRON', 'वे': 'PRON', 'ये': 'PRON',
+             'हम': 'PRON', 'वो': 'PRON',
+             'मुझे': 'PRON', 'तुझे': 'PRON', 'उसे': 'PRON', 'इसे': 'PRON',
+             'हमें': 'PRON', 'उन्हें': 'PRON', 'इन्हें': 'PRON',
+             'मेरा': 'PRON', 'तेरा': 'PRON', 'उसका': 'PRON', 'इसका': 'PRON',
+             'कौन': 'PRON', 'क्या': 'PRON', 'कहाँ': 'PRON', 'कब': 'PRON',
+             'जो': 'REL', 'जिसे': 'REL', 'जिसको': 'REL',
+         }
+ 
+         # Conjunctions
+         self.conjunctions = {
+             'और': 'CONJ', 'या': 'CONJ', 'लेकिन': 'CONJ', 'परंतु': 'CONJ',
+             'कि': 'CONJ', 'क्योंकि': 'CONJ', 'अगर': 'CONJ', 'यदि': 'CONJ',
+             'जब': 'CONJ', 'तो': 'CONJ', 'तब': 'CONJ', 'फिर': 'CONJ',
+             'इसलिए': 'CONJ', 'जबकि': 'CONJ', 'हालांकि': 'CONJ',
+         }
+ 
+         # Adverbs
+         self.adverbs = {
+             'बहुत': 'ADV', 'अच्छी': 'ADV', 'तरह': 'ADV', 'जल्दी': 'ADV',
+             'धीरे': 'ADV', 'अभी': 'ADV', 'अब': 'ADV', 'कल': 'ADV',
+             'आज': 'ADV', 'फिर': 'ADV', 'यहाँ': 'ADV', 'वहाँ': 'ADV',
+             'कभी': 'ADV', 'हमेशा': 'ADV', 'बस': 'ADV',
+             'सिर्फ': 'ADV', 'भी': 'ADV', 'ही': 'ADV', 'तो': 'ADV',
+         }
+ 
+         # Negators / particles
+         self.particles = {
+             'नहीं': 'NEG', 'न': 'NEG', 'मत': 'NEG',
+             'ही': 'PRT', 'भी': 'PRT', 'तो': 'PRT',
+             'जी': 'PRT', 'हाँ': 'PRT',
+         }
+ 
+         # Auxiliaries
+         self.auxiliaries = {
+             'है': 'AUX', 'हैं': 'AUX', 'था': 'AUX', 'थी': 'AUX', 'थे': 'AUX', 'थीं': 'AUX',
+             'हूँ': 'AUX', 'हो': 'AUX',
+             'रहा': 'AUX', 'रही': 'AUX', 'रहे': 'AUX',
+             'गया': 'AUX', 'गयी': 'AUX', 'गए': 'AUX',
+             'सकता': 'AUX', 'सकती': 'AUX', 'सकते': 'AUX',
+             'चाहिए': 'AUX', 'होगा': 'AUX', 'होगी': 'AUX',
+         }
+ 
+         # Common nouns (high frequency)
+         self.common_nouns = {
+             'लोग': 'NC', 'आदमी': 'NC', 'औरत': 'NC', 'बच्चा': 'NC',
+             'घर': 'NC', 'काम': 'NC', 'समय': 'NC', 'दिन': 'NC',
+             'साल': 'NC', 'बात': 'NC', 'पानी': 'NC', 'खाना': 'NC',
+             'देश': 'NC', 'शहर': 'NC', 'सरकार': 'NC', 'कंपनी': 'NC',
+         }
+ 
+         # Verb stems (high frequency)
+         self.verb_stems = {
+             'कर': 'V', 'हो': 'V', 'जा': 'V', 'आ': 'V', 'दे': 'V',
+             'ले': 'V', 'रह': 'V', 'बोल': 'V', 'कह': 'V', 'सुन': 'V',
+             'देख': 'V', 'खा': 'V', 'पी': 'V', 'लिख': 'V', 'पढ़': 'V',
+             'चल': 'V', 'बैठ': 'V', 'उठ': 'V', 'सो': 'V', 'मिल': 'V',
+         }
+ 
+     def _build_domain_dictionaries(self):
+         """Domain-specific dictionaries."""
+         self._domain_dictionaries[Domain.TECH] = {
+             'सेब': ('Apple', 'NP'),   # Apple company
+             'बादल': ('cloud', 'NC'),  # cloud computing
+         }
+         self._domain_dictionaries[Domain.FOOD] = {
+             'सेब': ('सेब', 'NC'),  # apple (fruit)
+         }
+         self._domain_dictionaries[Domain.FINANCE] = {
+             'बैंक': ('बैंक', 'NC'),   # bank
+             'शेयर': ('शेयर', 'NC'),  # stock
+         }
+ 
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+         return [result]
+ 
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         result = []
+         pos = 0
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+ 
+             devanagari_match = self.DEVANAGARI_PATTERN.match(text[pos:])
+             if devanagari_match:
+                 word = devanagari_match.group()
+                 morpheme = self._analyze_word(word, pos, domain)
+                 result.append(morpheme)
+                 pos += len(word)
+                 continue
+ 
+             # Latin script (English loanwords)
+             latin_match = re.match(r'[a-zA-Z]+', text[pos:])
+             if latin_match:
+                 word = latin_match.group()
+                 result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
+                 pos += len(word)
+                 continue
+ 
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+ 
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+             pos += 1
+         return result
+ 
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+         """Analyze a single word."""
+ 
+         # Runtime user dictionary
+         if word in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word]
+             return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+ 
+         # Domain dictionary
+         domain_sense = self._get_domain_sense(word, domain)
+         if domain_sense:
+             return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+ 
+         # Postpositions
+         if word in self.postpositions:
+             return Morpheme(surface=word, lemma=word, pos='PSP', start=offset, end=offset + len(word))
+ 
+         # Pronouns
+         if word in self.pronouns:
+             return Morpheme(surface=word, lemma=word, pos='PRON', start=offset, end=offset + len(word))
+ 
+         # Conjunctions
+         if word in self.conjunctions:
+             return Morpheme(surface=word, lemma=word, pos='CONJ', start=offset, end=offset + len(word))
+ 
+         # Adverbs
+         if word in self.adverbs:
+             return Morpheme(surface=word, lemma=word, pos='ADV', start=offset, end=offset + len(word))
+ 
+         # Particles
+         if word in self.particles:
+             return Morpheme(surface=word, lemma=word, pos=self.particles[word], start=offset, end=offset + len(word))
+ 
+         # Auxiliaries
+         if word in self.auxiliaries:
+             return Morpheme(surface=word, lemma=word, pos='AUX', start=offset, end=offset + len(word))
+ 
+         # Common nouns
+         if word in self.common_nouns:
+             return Morpheme(surface=word, lemma=word, pos='NC', start=offset, end=offset + len(word))
+ 
+         # Check verb stems
+         for stem in self.verb_stems:
+             if word.startswith(stem) and len(word) > len(stem):
+                 return Morpheme(surface=word, lemma=stem, pos='V', start=offset, end=offset + len(word))
+ 
+         # Suffix heuristics
+         lemma, pos_tag = self._analyze_morphology(word)
+         return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+ 
+     def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+         """Suffix-based morphological heuristics."""
+         # -ना verb infinitive
+         if word.endswith('ना') and len(word) > 2:
+             return (word, 'V')
+ 
+         # -ता/-ती/-ते present participle
+         if word.endswith(('ता', 'ती', 'ते')) and len(word) > 2:
+             return (word[:-1], 'V')
+ 
+         # -ा/-ी/-े past participle
+         if word.endswith(('ा', 'ी', 'े')) and len(word) > 2:
+             return (word[:-1], 'V')
+ 
+         # -ई noun (feminine)
+         if word.endswith('ई') and len(word) > 2:
+             return (word, 'NC')
+ 
+         # -आ noun (masculine)
+         if word.endswith('आ') and len(word) > 2:
+             return (word, 'NC')
+ 
+         # Default: noun
+         return (word, 'NC')
+ 
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+         return alternatives
+ 
+ 
+ HindiAnalyzer = HindiAdvancedAnalyzer
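
A minimal usage sketch for the analyzer above, assuming the `AdvancedMorphologicalAnalyzer` base class (not shown in this diff) exposes a public `analyze(text, domain)` entry point that wraps `_generate_candidates`; the call signature here is hypothetical:

    # Hypothetical entry point: only _generate_candidates / _generate_alternatives
    # appear in this diff; a public wrapper is assumed to live in advanced_base.
    from tokmor.morphology.hindi_advanced import HindiAdvancedAnalyzer
    from tokmor.morphology.advanced_base import Domain

    analyzer = HindiAdvancedAnalyzer()
    result = analyzer.analyze("मैं घर जा रहा हूँ", domain=Domain.TECH)  # assumed signature
    for m in result.morphemes:
        print(m.surface, m.lemma, m.pos)  # e.g. मैं/मैं/PRON, घर/घर/NC, ...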
@@ -0,0 +1,417 @@
+ """
+ Japanese Morphological Analyzer - Self-Contained Implementation
+ ===============================================================
+ 
+ A Japanese morphological analyzer implemented in pure Python, with no
+ external libraries.
+ 
+ Optional asset (high ROI):
+ - If `TOKMOR_DATA_DIR/seg_lexicon/ja_wordfreq.pkl` exists, it is used as
+   Viterbi candidate scores for conservatively splitting long kanji runs.
+   (No assets are bundled with the core.)
+ """
+ 
+ import re
+ import math
+ import pickle
+ from typing import List, Tuple, Optional
+ from dataclasses import dataclass
+ 
+ from ..resources import resolve_seg_lexicon_path
+ 
+ 
+ @dataclass
+ class Morpheme:
+     """A morpheme."""
+     surface: str
+     lemma: str
+     pos: str
+     start: int
+     end: int
+     reading: str = ''
+ 
+     def __repr__(self):
+         return f"{self.surface}/{self.pos}"
+ 
+ 
+ class JapaneseAnalyzer:
+     """
+     Japanese morphological analyzer.
+ 
+     Usage:
+         analyzer = JapaneseAnalyzer()
+         result = analyzer.analyze("東京に行きます")
+     """
+ 
+     def __init__(self):
+         # Optional kanji wordfreq lexicon (offline): {token: freq}
+         self._wordfreq = None
+         self._wordfreq_max_len = 4
+         self._build_dictionary()
+         self._load_seg_lexicon()
+ 
+     def _build_dictionary(self):
+         """Build dictionaries."""
+ 
+         # Particles
+         self.particles = {
+             'は': 'HA', 'が': 'GA', 'を': 'WO', 'に': 'NI', 'へ': 'HE',
+             'で': 'DE', 'と': 'TO', 'から': 'KARA', 'まで': 'MADE',
+             'より': 'YORI', 'の': 'NO', 'も': 'MO', 'や': 'YA',
+             'など': 'NADO', 'か': 'KA', 'ね': 'NE', 'よ': 'YO',
+         }
+ 
+         # Auxiliary verbs / endings
+         self.auxiliaries = {
+             'です': 'AUX', 'ます': 'AUX', 'た': 'AUX', 'だ': 'AUX',
+             'ない': 'AUX', 'れる': 'AUX', 'られる': 'AUX',
+             'せる': 'AUX', 'させる': 'AUX', 'たい': 'AUX',
+             'ている': 'AUX', 'てる': 'AUX', 'ました': 'AUX',
+         }
+ 
+         # Nouns
+         self.nouns = {
+             '東京': ('名詞', 'トウキョウ'), '日本': ('名詞', 'ニホン'),
+             '大阪': ('名詞', 'オオサカ'), '京都': ('名詞', 'キョウト'),
+             '会社': ('名詞', 'カイシャ'), '学校': ('名詞', 'ガッコウ'),
+             '仕事': ('名詞', 'シゴト'), '人': ('名詞', 'ヒト'),
+             '時間': ('名詞', 'ジカン'), '今日': ('名詞', 'キョウ'),
+             '明日': ('名詞', 'アシタ'), '昨日': ('名詞', 'キノウ'),
+             '私': ('名詞', 'ワタシ'), '彼': ('名詞', 'カレ'),
+             '発表': ('名詞', 'ハッピョウ'), '自動車': ('名詞', 'ジドウシャ'),
+         }
+ 
+         # Verbs (godan/ichidan/kuru/suru)
+         self.verbs = {
+             '行': ('動詞', '行く', 'godan'),
+             '来': ('動詞', '来る', 'kuru'),
+             '見': ('動詞', '見る', 'ichidan'),
+             '食': ('動詞', '食べる', 'ichidan'),
+             '話': ('動詞', '話す', 'godan'),
+             '読': ('動詞', '読む', 'godan'),
+             '書': ('動詞', '書く', 'godan'),
+             '聞': ('動詞', '聞く', 'godan'),
+             '思': ('動詞', '思う', 'godan'),
+             '言': ('動詞', '言う', 'godan'),
+             'し': ('動詞', 'する', 'suru'),
+             'する': ('動詞', 'する', 'suru'),
+         }
+ 
+         # Adjectives (register both stems and plain forms)
+         self.adjectives = {
+             # i-adjective stems
+             '大き': ('形容詞', '大きい'),
+             '小さ': ('形容詞', '小さい'),
+             '高': ('形容詞', '高い'),
+             '安': ('形容詞', '安い'),
+             '新し': ('形容詞', '新しい'),
+             '古': ('形容詞', '古い'),
+             '良': ('形容詞', '良い'),
+             '悪': ('形容詞', '悪い'),
+             '長': ('形容詞', '長い'),
+             '短': ('形容詞', '短い'),
+             '早': ('形容詞', '早い'),
+             '遅': ('形容詞', '遅い'),
+             '強': ('形容詞', '強い'),
+             '弱': ('形容詞', '弱い'),
+             '多': ('形容詞', '多い'),
+             '少な': ('形容詞', '少ない'),
+             '美し': ('形容詞', '美しい'),
+             '難し': ('形容詞', '難しい'),
+             '易し': ('形容詞', '易しい'),
+             # i-adjective plain (terminal) forms
+             '大きい': ('形容詞', '大きい'),
+             '小さい': ('形容詞', '小さい'),
+             '高い': ('形容詞', '高い'),
+             '安い': ('形容詞', '安い'),
+             '新しい': ('形容詞', '新しい'),
+             '古い': ('形容詞', '古い'),
+             '良い': ('形容詞', '良い'),
+             '悪い': ('形容詞', '悪い'),
+             '長い': ('形容詞', '長い'),
+             '短い': ('形容詞', '短い'),
+             '早い': ('形容詞', '早い'),
+             '遅い': ('形容詞', '遅い'),
+             '強い': ('形容詞', '強い'),
+             '弱い': ('形容詞', '弱い'),
+             '多い': ('形容詞', '多い'),
+             '少ない': ('形容詞', '少ない'),
+             '美しい': ('形容詞', '美しい'),
+             '難しい': ('形容詞', '難しい'),
+             '易しい': ('形容詞', '易しい'),
+         }
+ 
+         # Hiragana range
+         self.hiragana = re.compile(r'[\u3040-\u309f]+')
+         # Katakana range
+         self.katakana = re.compile(r'[\u30a0-\u30ff]+')
+         # Kanji range
+         self.kanji = re.compile(r'[\u4e00-\u9fff]+')
+ 
+     def _load_seg_lexicon(self) -> None:
+         p = resolve_seg_lexicon_path("ja")
+         if not p:
+             return
+         try:
+             obj = pickle.loads(p.read_bytes())
+             if not isinstance(obj, dict):
+                 return
+             wf = {}
+             mx = 2
+             for k, v in obj.items():
+                 if isinstance(k, str) and k and isinstance(v, int) and v > 0:
+                     # Japanese kanji-only lexicon expected; keep it conservative
+                     if len(k) < 2:
+                         continue
+                     wf[k] = int(v)
+                     if len(k) > mx:
+                         mx = len(k)
+             if wf:
+                 self._wordfreq = wf
+                 self._wordfreq_max_len = max(2, min(int(mx), 8))
+         except Exception:
+             return
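
The loader above accepts a plain pickled dict mapping kanji tokens (length ≥ 2) to positive integer frequencies, and caps the usable span length at 8. A minimal sketch of producing such an asset; the output path and counts below are placeholders, with the directory layout taken from the module docstring (`TOKMOR_DATA_DIR/seg_lexicon/ja_wordfreq.pkl`):

    import pickle
    from pathlib import Path

    # Placeholder root; in practice TOKMOR_DATA_DIR decides where
    # resolve_seg_lexicon_path("ja") will look.
    out = Path("tokmor_data/seg_lexicon/ja_wordfreq.pkl")
    out.parent.mkdir(parents=True, exist_ok=True)

    # {kanji token: frequency}; keys shorter than 2 characters are skipped by the loader.
    wordfreq = {"東京": 120_000, "自動車": 9_000, "発表": 15_000}
    out.write_bytes(pickle.dumps(wordfreq))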
+ 
+     def _viterbi_kanji_run(self, run: str, start: int) -> List[Morpheme]:
+         """
+         Viterbi segmentation over a pure-kanji run using the optional wordfreq.
+         If the lexicon is missing, the caller should not use this.
+         """
+         wf = self._wordfreq or {}
+         max_len = max(self._wordfreq_max_len, 4)
+         max_len = max(2, min(int(max_len), 8))
+         n = len(run)
+ 
+         # Conservative scoring: prefer longer spans, penalize 1-char, reward known spans.
+         len_bonus = 0.75
+         single_penalty = 1.2
+         unk_base = -1.6
+         unk_len_penalty = 0.35
+         freq_cap = 200_000
+ 
+         best = [-1e100] * (n + 1)
+         back = [-1] * (n + 1)
+         back_len = [1] * (n + 1)
+         best[0] = 0.0
+ 
+         for i in range(n):
+             if best[i] <= -1e90:
+                 continue
+             # allow spans of length 1..max_len
+             for L in range(1, max_len + 1):
+                 j = i + L
+                 if j > n:
+                     break
+                 span = run[i:j]
+                 f = wf.get(span)
+                 if f is not None:
+                     f2 = min(int(f), freq_cap)
+                     s = best[i] + math.log(f2 + 1.0) + len_bonus * (L - 1)
+                     if L == 1:
+                         s -= single_penalty
+                 else:
+                     # unknown: allow grouping, but penalize longer unknown spans
+                     s = best[i] + unk_base - unk_len_penalty * (L - 1)
+                     if L == 1:
+                         s -= single_penalty
+ 
+                 if s > best[j]:
+                     best[j] = s
+                     back[j] = i
+                     back_len[j] = L
+ 
+         # backtrack
+         out: List[Morpheme] = []
+         j = n
+         if best[j] <= -1e90:
+             # fallback: keep the whole run
+             return [Morpheme(run, run, '名詞', start, start + n)]
+ 
+         while j > 0:
+             i = back[j]
+             L = back_len[j]
+             if i < 0:
+                 break
+             span = run[i:j]
+             out.append(Morpheme(span, span, '名詞', start + i, start + j))
+             j = i
+         out.reverse()
+         return out
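
For intuition about the constants above, here is how the per-span scores compare for one known long span versus a shorter known span plus an unknown single character. The frequencies are made up for illustration and are not from any shipped lexicon:

    import math

    len_bonus, single_penalty = 0.75, 1.2
    unk_base, unk_len_penalty = -1.6, 0.35

    def span_score(span_len, freq=None):
        # Mirrors the per-span scoring inside _viterbi_kanji_run.
        if freq is not None:
            s = math.log(freq + 1.0) + len_bonus * (span_len - 1)
        else:
            s = unk_base - unk_len_penalty * (span_len - 1)
        return s - (single_penalty if span_len == 1 else 0.0)

    # A known 3-char span easily beats a known 2-char span plus an unknown
    # 1-char tail, so the Viterbi pass prefers the longer dictionary word.
    print(span_score(3, freq=9000))                 # ≈ 10.6
    print(span_score(2, freq=150) + span_score(1))  # ≈ 3.0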
+ 
+     def analyze(self, text: str) -> List[Morpheme]:
+         """Morphological analysis."""
+         if not text:
+             return []
+ 
+         result = []
+         pos = 0
+ 
+         while pos < len(text):
+             matched = False
+ 
+             # Whitespace
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+ 
+             # Check for a kanji run (person/proper names take priority):
+             # a run of two or more kanji is handled as a proper noun first.
+             if self.kanji.match(text[pos]):
+                 kanji_match = self.kanji.match(text[pos:])
+                 kanji_chunk = kanji_match.group()
+                 # Two or more consecutive kanji followed by a particle: treat as a proper noun
+                 next_pos = pos + len(kanji_chunk)
+                 if len(kanji_chunk) >= 2:
+                     if next_pos >= len(text) or text[next_pos] in self.particles or text[next_pos].isspace():
+                         # Look in the dictionary first
+                         found_in_dict = False
+                         for l in range(len(kanji_chunk), 0, -1):
+                             if text[pos:pos+l] in self.nouns:
+                                 info = self.nouns[text[pos:pos+l]]
+                                 result.append(Morpheme(
+                                     text[pos:pos+l], text[pos:pos+l], info[0],
+                                     pos, pos + l, info[1]
+                                 ))
+                                 pos += l
+                                 found_in_dict = True
+                                 matched = True
+                                 break
+                         if found_in_dict:
+                             continue
+                         # Optional: if we have a kanji lexicon, split long runs *only when it clearly helps*.
+                         if self._wordfreq and len(kanji_chunk) >= 6:
+                             parts = self._viterbi_kanji_run(kanji_chunk, pos)
+                             # accept only if we got multiple non-trivial parts (avoid over-fragmentation)
+                             if len(parts) >= 2 and all(len(p.surface) >= 2 for p in parts):
+                                 result.extend(parts)
+                                 pos += len(kanji_chunk)
+                                 matched = True
+                                 continue
+ 
+                         # Not in the dictionary: keep the whole run as a proper noun
+                         result.append(Morpheme(
+                             kanji_chunk, kanji_chunk, '固有名詞',
+                             pos, pos + len(kanji_chunk)
+                         ))
+                         pos += len(kanji_chunk)
+                         matched = True
+                         continue
+ 
+             # Longest-match dictionary lookup
+             for length in range(min(len(text) - pos, 10), 0, -1):
+                 substring = text[pos:pos+length]
+ 
+                 # Adjectives (checked before nouns) - standalone adjectives only
+                 if substring in self.adjectives and length > 1:
+                     info = self.adjectives[substring]
+                     result.append(Morpheme(
+                         substring, info[1], '形容詞',
+                         pos, pos + length
+                     ))
+                     pos += length
+                     matched = True
+                     break
+ 
+                 # Nouns
+                 if substring in self.nouns:
+                     info = self.nouns[substring]
+                     result.append(Morpheme(
+                         substring, substring, info[0],
+                         pos, pos + length, info[1]
+                     ))
+                     pos += length
+                     matched = True
+                     break
+ 
+                 # Particles
+                 if substring in self.particles:
+                     result.append(Morpheme(
+                         substring, substring, '助詞',
+                         pos, pos + length
+                     ))
+                     pos += length
+                     matched = True
+                     break
+ 
+                 # Auxiliaries
+                 if substring in self.auxiliaries:
+                     result.append(Morpheme(
+                         substring, substring, '助動詞',
+                         pos, pos + length
+                     ))
+                     pos += length
+                     matched = True
+                     break
+ 
+             if not matched:
+                 # Kanji chunk
+                 if self.kanji.match(text[pos]):
+                     match = self.kanji.match(text[pos:])
+                     chunk = match.group()
+                     # If we have a kanji wordfreq lexicon, split long runs conservatively.
+                     if self._wordfreq and len(chunk) >= 5:
+                         result.extend(self._viterbi_kanji_run(chunk, pos))
+                     else:
+                         result.append(Morpheme(
+                             chunk, chunk, '名詞',
+                             pos, pos + len(chunk)
+                         ))
+                     pos += len(chunk)
+                 # Hiragana chunk
+                 elif self.hiragana.match(text[pos]):
+                     match = self.hiragana.match(text[pos:])
+                     chunk = match.group()
+                     # Try verb-conjugation analysis
+                     analyzed = self._analyze_verb_form(chunk, pos)
+                     if analyzed:
+                         result.extend(analyzed)
+                     else:
+                         result.append(Morpheme(
+                             chunk, chunk, '名詞',
+                             pos, pos + len(chunk)
+                         ))
+                     pos += len(chunk)
+                 # Katakana chunk (loanwords)
+                 elif self.katakana.match(text[pos]):
+                     match = self.katakana.match(text[pos:])
+                     chunk = match.group()
+                     result.append(Morpheme(
+                         chunk, chunk, '名詞',
+                         pos, pos + len(chunk)
+                     ))
+                     pos += len(chunk)
+                 else:
+                     # Other (digits, symbols, etc.)
+                     result.append(Morpheme(
+                         text[pos], text[pos], '記号',
+                         pos, pos + 1
+                     ))
+                     pos += 1
+ 
+         return result
+ 
+     def _analyze_verb_form(self, form: str, offset: int) -> List[Morpheme]:
+         """Analyze an inflected verb form."""
+         results = []
+ 
+         # -masu form
+         if form.endswith('ます'):
+             stem = form[:-2]
+             if stem:
+                 results.append(Morpheme(stem, stem + 'る', '動詞', offset, offset + len(stem)))
+                 results.append(Morpheme('ます', 'ます', '助動詞', offset + len(stem), offset + len(form)))
+                 return results
+ 
+         # -ta form
+         if form.endswith('た') or form.endswith('だ'):
+             stem = form[:-1]
+             if stem:
+                 results.append(Morpheme(stem, stem, '動詞', offset, offset + len(stem)))
+                 results.append(Morpheme(form[-1], form[-1], '助動詞', offset + len(stem), offset + len(form)))
+                 return results
+ 
+         return []
+ 
+     def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+         """POS tagging."""
+         morphemes = self.analyze(text)
+         return [(m.surface, m.pos) for m in morphemes]
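
Tracing the rules above on the docstring's own example: 東京 is found in the noun dictionary (the following に is a particle, so the proper-noun path fires first), に matches as a particle, the lone kanji 行 falls through to the kanji-chunk fallback, and きます is split by the -masu rule. A quick check, with the expected output derived by hand rather than from a verified run:

    from tokmor.morphology.japanese import JapaneseAnalyzer

    analyzer = JapaneseAnalyzer()
    print(analyzer.pos_tag("東京に行きます"))
    # Expected per the rules above:
    # [('東京', '名詞'), ('に', '助詞'), ('行', '名詞'), ('き', '動詞'), ('ます', '助動詞')]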