tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Brahmic Script Language Template
|
|
3
|
+
================================
|
|
4
|
+
|
|
5
|
+
브라흐미 계열 문자 기반 언어용 템플릿 분석기
|
|
6
|
+
Devanagari, Bengali, Tamil, Telugu, Thai, Khmer, Myanmar, etc.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import List, Tuple, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from ..advanced_base import (
|
|
13
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BrahmicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in Brahmic-derived scripts.

    Covers:
    - Devanagari: Hindi, Marathi, Nepali, Sanskrit
    - Bengali: Bengali, Assamese
    - Tamil: Tamil
    - Telugu: Telugu
    - Kannada: Kannada
    - Malayalam: Malayalam
    - Gujarati: Gujarati
    - Punjabi (Gurmukhi): Punjabi
    - Odia: Odia
    - Sinhala: Sinhala
    - Thai: Thai
    - Khmer: Khmer
    - Myanmar: Burmese
    - Lao: Lao
    """

    # Unicode ranges for various Brahmic scripts
    SCRIPT_PATTERNS = {
        'devanagari': re.compile(r'[\u0900-\u097F]+'),
        'bengali': re.compile(r'[\u0980-\u09FF]+'),
        'tamil': re.compile(r'[\u0B80-\u0BFF]+'),
        'telugu': re.compile(r'[\u0C00-\u0C7F]+'),
        'kannada': re.compile(r'[\u0C80-\u0CFF]+'),
        'malayalam': re.compile(r'[\u0D00-\u0D7F]+'),
        'gujarati': re.compile(r'[\u0A80-\u0AFF]+'),
        'gurmukhi': re.compile(r'[\u0A00-\u0A7F]+'),
        'oriya': re.compile(r'[\u0B00-\u0B7F]+'),
        'sinhala': re.compile(r'[\u0D80-\u0DFF]+'),
        'thai': re.compile(r'[\u0E00-\u0E7F]+'),
        'lao': re.compile(r'[\u0E80-\u0EFF]+'),
        'khmer': re.compile(r'[\u1780-\u17FF]+'),
        'myanmar': re.compile(r'[\u1000-\u109F]+'),
        'tibetan': re.compile(r'[\u0F00-\u0FFF]+'),
    }

    # Combined pattern for any Brahmic script
    WORD_PATTERN = re.compile(
        r'[\u0900-\u097F'  # Devanagari
        r'\u0980-\u09FF'   # Bengali
        r'\u0A00-\u0A7F'   # Gurmukhi
        r'\u0A80-\u0AFF'   # Gujarati
        r'\u0B00-\u0B7F'   # Oriya
        r'\u0B80-\u0BFF'   # Tamil
        r'\u0C00-\u0C7F'   # Telugu
        r'\u0C80-\u0CFF'   # Kannada
        r'\u0D00-\u0D7F'   # Malayalam
        r'\u0D80-\u0DFF'   # Sinhala
        r'\u0E00-\u0E7F'   # Thai
        r'\u0E80-\u0EFF'   # Lao
        r'\u0F00-\u0FFF'   # Tibetan
        r'\u1000-\u109F'   # Myanmar
        r'\u1780-\u17FF'   # Khmer
        r']+'
    )
    # ASCII digits plus the digit blocks of the major Brahmic scripts
    NUMBER_PATTERN = re.compile(r'[0-9०-९০-৯੦-੯૦-૯୦-୯௦-௯౦-౯೦-೯൦-൯๐-๙]+')
    # Hoisted to class level: previously re-compiled inside _analyze_text on
    # every Latin-run check.
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')

    # Script-specific setting (override in subclass)
    SCRIPT_NAME: str = 'devanagari'

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Override in subclass: populate the language-specific lexicons."""
        self.postpositions: Dict[str, str] = {}
        self.function_words: Dict[str, str] = {}
        self.verb_stems: Dict[str, str] = {}

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best analysis for *text* under *domain*.

        Returns a one-element list; an empty/whitespace-only input yields a
        single empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right, emitting one Morpheme per token.

        Token classes, in priority order: Brahmic-script word, Latin run
        (tagged FOREIGN), number, then single-character punctuation.
        """
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Brahmic script word.  Pattern.match(text, pos) anchors at pos
            # without slicing, avoiding O(n^2) copying on long inputs.
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            # Latin (foreign/transliteration)
            latin_match = self.LATIN_PATTERN.match(text, pos)
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            # Number
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Punctuation (single character fallback)
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify a single Brahmic-script word.

        Lookup order: user dictionary, domain dictionary, postpositions,
        function words, verb-stem prefix match, then default to noun.
        """

        # User dictionary
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # Domain dictionary
        domain_sense = self._get_domain_sense(word, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # Postpositions
        if hasattr(self, 'postpositions') and word in self.postpositions:
            return Morpheme(surface=word, lemma=word, pos='PSP', start=offset, end=offset + len(word))

        # Function words
        if hasattr(self, 'function_words') and word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))

        # Verb stem check.  Only the stem (key) is needed: the POS is always
        # 'V' here, so iterate keys rather than .items().
        if hasattr(self, 'verb_stems'):
            for stem in self.verb_stems:
                if word.startswith(stem) and len(word) > len(stem):
                    return Morpheme(surface=word, lemma=stem, pos='V', start=offset, end=offset + len(word))

        # Default: noun
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored relative to the primary analysis."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cyrillic Script Language Template
|
|
3
|
+
=================================
|
|
4
|
+
|
|
5
|
+
키릴 문자 기반 언어용 템플릿 분석기
|
|
6
|
+
Russian, Ukrainian, Bulgarian, Serbian, Macedonian, etc.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import List, Tuple, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from ..advanced_base import (
|
|
13
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CyrillicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in the Cyrillic script.

    Covers: Russian, Ukrainian, Bulgarian, Serbian (Cyrillic),
    Macedonian, Belarusian, Kazakh, Uzbek, Mongolian, etc.
    """

    # Cyrillic character pattern (extended)
    WORD_PATTERN = re.compile(
        r'[а-яА-ЯёЁ'  # Russian/basic
        r'іїєґІЇЄҐ'  # Ukrainian
        r'ўЎ'  # Belarusian
        r'ђјљњћџЂЈЉЊЋЏ'  # Serbian
        r'ѓќѕѝЃЌЅЍ'  # Macedonian
        r'әғқңөұүһӘҒҚҢӨҰҮҺ'  # Kazakh
        r'ғқҳўғҚҲЎ'  # Uzbek
        r'өүөҮ]+'  # Mongolian
    )
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')
    # Hoisted to class level: previously re-compiled inside _analyze_text on
    # every Latin-run check.
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')

    # Override in subclass
    VERB_INFINITIVE_SUFFIX: str = 'ть'  # Default: Russian
    REFLEXIVE_SUFFIX: str = 'ся'

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Override in subclass: populate the language-specific lexicons."""
        self.function_words: Dict[str, str] = {}
        self.irregular_verbs: Dict[str, str] = {}

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best analysis for *text* under *domain*.

        Returns a one-element list; an empty/whitespace-only input yields a
        single empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right, emitting one Morpheme per token.

        Token classes, in priority order: Cyrillic word, Latin run (tagged
        FOREIGN), number, then single-character punctuation.
        """
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Cyrillic word.  Pattern.match(text, pos) anchors at pos without
            # slicing, avoiding O(n^2) copying on long inputs.
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            # Latin (foreign words)
            latin_match = self.LATIN_PATTERN.match(text, pos)
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            # Number
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Punctuation (single character fallback)
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify a single Cyrillic word.

        Lookup order: user dictionary, domain dictionary, function words,
        irregular verbs, then suffix-based morphology.
        """
        word_lower = word.lower()

        # User dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # Domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # Function words
        if hasattr(self, 'function_words') and word_lower in self.function_words:
            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=offset + len(word))

        # Irregular verbs
        if hasattr(self, 'irregular_verbs') and word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))

        # Morphological analysis
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Suffix-based morphological analysis for Cyrillic.

        Returns a (lemma, pos_tag) pair.  Suffix sets below are Russian
        defaults; subclasses override the class-level suffix constants.
        """
        word_lower = word.lower()

        # Reflexive verb: strip the reflexive suffix to recover the stem.
        # BUGFIX: previously hardcoded [:-2], which is wrong whenever a
        # subclass overrides REFLEXIVE_SUFFIX with a different length.
        if word_lower.endswith(self.REFLEXIVE_SUFFIX) and len(word_lower) > 4:
            return (word_lower[:-len(self.REFLEXIVE_SUFFIX)], 'V')

        # Verb infinitive
        if word_lower.endswith(self.VERB_INFINITIVE_SUFFIX) and len(word_lower) > 3:
            return (word_lower, 'V')

        # Verbal noun suffixes (-ние, -ение, -ание, -ость, -есть)
        if word_lower.endswith(('ние', 'ение', 'ание', 'ость', 'есть')) and len(word_lower) > 5:
            return (word_lower, 'N')

        # Adjective endings (-ый, -ий, -ой, -ая, -яя, -ое, -ее)
        if word_lower.endswith(('ый', 'ий', 'ой', 'ая', 'яя', 'ое', 'ее', 'ые', 'ие')) and len(word_lower) > 3:
            return (word_lower, 'ADJ')

        # Adverb (-о, -е for short adjectives used as adverbs)
        if word_lower.endswith('о') and len(word_lower) > 3:
            # Could be adverb or neuter adjective
            return (word_lower, 'ADV')

        # Proper noun (capitalized)
        if word[0].isupper():
            return (word, 'NP')

        # Default: noun
        return (word_lower, 'N')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored relative to the primary analysis."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Latin Script Language Template
|
|
3
|
+
==============================
|
|
4
|
+
|
|
5
|
+
라틴 문자 기반 언어용 템플릿 분석기
|
|
6
|
+
Romance, Germanic, Slavic (Latin), Turkic, etc.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Tuple, Dict, Optional, Set
|
|
13
|
+
|
|
14
|
+
from ..advanced_base import (
|
|
15
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# 확장 사전 디렉토리
|
|
19
|
+
from ... import resources
|
|
20
|
+
|
|
21
|
+
# Optional external asset dir (default: none). If you want extended dictionaries,
|
|
22
|
+
# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
|
|
23
|
+
DICT_DIR = resources.data_dir() / "extended_dict"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LatinScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in the Latin script.

    Override in subclasses:
    - LANG_CODE, LANG_NAME
    - _build_base_dictionary()
    - _build_domain_dictionaries()
    """

    # Extended Latin characters (covers most European languages)
    WORD_PATTERN = re.compile(
        r"[a-zA-Zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
        r"ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"  # Polish
        r"čďěňřšťůžČĎĚŇŘŠŤŮŽ"  # Czech/Slovak
        r"őűŐŰ"  # Hungarian
        r"ăâîșțĂÂÎȘȚ"  # Romanian
        r"āēīōūĀĒĪŌŪ"  # Latvian/Lithuanian
        r"ğışöüçĞİŞÖÜÇ"  # Turkish
        r"ảạấầẩẫậắằẳẵặẻẽẹếềểễệỉĩịỏọốồổỗộớờởỡợủũụứừửữựỳỵỷỹ"  # Vietnamese
        r"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß]+"
        r"(?:[-'][a-zA-Zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+)*"
    )
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    # Common suffixes for morphological analysis (override in subclass)
    VERB_INFINITIVE_SUFFIXES: List[str] = []
    VERB_PARTICIPLE_SUFFIXES: List[str] = []
    NOUN_PLURAL_SUFFIXES: List[str] = []
    ADJECTIVE_SUFFIXES: List[str] = []
    ADVERB_SUFFIXES: List[str] = []

    def __init__(self):
        super().__init__()
        # Load the extended dictionary after super().__init__() has run
        # _build_base_dictionary().
        self._load_extended_dictionary()

    def _build_base_dictionary(self):
        """Override in subclass: populate the language-specific lexicons."""
        self.function_words: Dict[str, str] = {}
        self.irregular_verbs: Dict[str, str] = {}
        self.irregular_nouns: Dict[str, str] = {}

    def _load_extended_dictionary(self):
        """Best-effort load of the optional external extended dictionary.

        Looks for TOKMOR_DATA_DIR/extended_dict/{LANG_CODE}_extended.json,
        a flat mapping of word -> UPOS tag.  The asset is optional, so a
        missing, unreadable, or malformed file must never break analyzer
        construction; it is silently skipped.
        """
        # Initialize extended-dictionary attributes (idempotent: a subclass
        # may have pre-populated them in _build_base_dictionary).
        if not hasattr(self, 'extended_nouns'):
            self.extended_nouns: Dict[str, str] = {}
        if not hasattr(self, 'extended_verbs'):
            self.extended_verbs: Dict[str, str] = {}
        if not hasattr(self, 'extended_adjs'):
            self.extended_adjs: Dict[str, str] = {}
        if not hasattr(self, 'extended_advs'):
            self.extended_advs: Dict[str, str] = {}

        lang_code = getattr(self, 'LANG_CODE', None)
        if not lang_code:
            return

        dict_path = DICT_DIR / f'{lang_code}_extended.json'
        # EAFP: previously a bare exists()+open()+json.load(), which crashed
        # __init__ on a corrupt or concurrently-removed optional asset.
        # json.JSONDecodeError is a subclass of ValueError.
        try:
            with open(dict_path, 'r', encoding='utf-8') as f:
                extended = json.load(f)
        except (OSError, ValueError):
            return

        # Map UPOS tags to (target dictionary, internal POS tag).
        upos_targets = {
            'NOUN': (self.extended_nouns, 'N'),
            'PROPN': (self.extended_nouns, 'NP'),
            'VERB': (self.extended_verbs, 'V'),
            'ADJ': (self.extended_adjs, 'ADJ'),
            'ADV': (self.extended_advs, 'ADV'),
        }
        for word, upos in extended.items():
            target = upos_targets.get(upos)
            if target is not None:
                target[0][word] = target[1]

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best analysis for *text* under *domain*.

        Returns a one-element list; an empty/whitespace-only input yields a
        single empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right, emitting one Morpheme per token.

        Token classes, in priority order: Latin-script word, number, then
        single-character punctuation/symbol.
        """
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Word matching.  Pattern.match(text, pos) anchors at pos without
            # slicing, avoiding O(n^2) copying on long inputs.
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            # Number
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Punctuation/Symbol (single character fallback)
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify a single Latin-script word.

        Lookup order: user dictionary, domain dictionary, function words,
        irregular verbs/nouns, the optional extended dictionary, then
        suffix-based morphology.
        """
        word_lower = word.lower()

        # 1. User dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # 2. Domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # 3. Function words
        if hasattr(self, 'function_words') and word_lower in self.function_words:
            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=offset + len(word))

        # 4. Irregular verbs
        if hasattr(self, 'irregular_verbs') and word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))

        # 5. Irregular nouns
        if hasattr(self, 'irregular_nouns') and word_lower in self.irregular_nouns:
            return Morpheme(surface=word, lemma=self.irregular_nouns[word_lower], pos='N', start=offset, end=offset + len(word))

        # 6. Extended dictionary (optional external).  Verb check first to
        # match the original lookup precedence.
        if hasattr(self, 'extended_verbs') and word_lower in self.extended_verbs:
            return Morpheme(surface=word, lemma=word_lower, pos='V', start=offset, end=offset + len(word))
        if hasattr(self, 'extended_nouns') and word_lower in self.extended_nouns:
            return Morpheme(surface=word, lemma=word_lower, pos=self.extended_nouns[word_lower], start=offset, end=offset + len(word))
        if hasattr(self, 'extended_adjs') and word_lower in self.extended_adjs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADJ', start=offset, end=offset + len(word))
        if hasattr(self, 'extended_advs') and word_lower in self.extended_advs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))

        # 7. Morphological analysis
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Suffix-based morphological analysis.

        Returns a (lemma, pos_tag) pair.  Suffix lists are empty by default;
        subclasses supply language-specific values.
        """
        word_lower = word.lower()

        # Verb infinitive (lemma is the infinitive itself)
        for suffix in self.VERB_INFINITIVE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'V')

        # Verb participle (strip the suffix to approximate the stem)
        for suffix in self.VERB_PARTICIPLE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                stem = word_lower[:-len(suffix)]
                return (stem, 'V')

        # Noun plural (strip the suffix to recover the singular)
        for suffix in self.NOUN_PLURAL_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                stem = word_lower[:-len(suffix)]
                return (stem, 'N')

        # Adjective
        for suffix in self.ADJECTIVE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'ADJ')

        # Adverb
        for suffix in self.ADVERB_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'ADV')

        # Proper noun (capitalized)
        if word[0].isupper():
            return (word, 'NP')

        # Default: noun
        return (word_lower, 'N')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored relative to the primary analysis."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
|