tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tier 4 Languages - Extended Global Coverage
|
|
3
|
+
============================================
|
|
4
|
+
|
|
5
|
+
25+ languages: South Asian, Southeast Asian, African, Caucasian, Central Asian, etc.
|
|
6
|
+
bn, ta, te, mr, gu, kn, ml, pa, ur, ne, si, my, km, lo, tl, ms, sw, am, yo, ha, zu, af, ka, hy, az, kk, uz, mn
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .templates.latin_template import LatinScriptAnalyzer
|
|
10
|
+
from .templates.cyrillic_template import CyrillicScriptAnalyzer
|
|
11
|
+
from .templates.arabic_script_template import ArabicScriptAnalyzer
|
|
12
|
+
from .templates.brahmic_template import BrahmicScriptAnalyzer
|
|
13
|
+
from .templates.other_scripts_template import GeorgianScriptAnalyzer, EthiopicScriptAnalyzer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# =============================================================================
|
|
17
|
+
# South Asian Languages
|
|
18
|
+
# =============================================================================
|
|
19
|
+
|
|
20
|
+
class BengaliAnalyzer(BrahmicScriptAnalyzer):
    """Bengali (bn) analyzer layered on the Brahmic script template."""

    LANG_CODE = "bn"
    LANG_NAME = "Bengali"
    SCRIPT_NAME = "bengali"

    def _build_base_dictionary(self):
        """Fill the Bengali postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('এ', 'তে', 'কে', 'র', 'থেকে'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('আমি', 'তুমি', 'সে', 'আমরা', 'তোমরা', 'তারা')),
            ('CONJ', ('এবং', 'বা', 'কিন্তু', 'যে')),
            ('NEG', ('না',)),
            ('ADV', ('হ্যাঁ', 'খুব', 'ভালো')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class TamilAnalyzer(BrahmicScriptAnalyzer):
    """Tamil (ta) analyzer layered on the Brahmic script template."""

    LANG_CODE = "ta"
    LANG_NAME = "Tamil"
    SCRIPT_NAME = "tamil"

    def _build_base_dictionary(self):
        """Fill the Tamil postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('இல்', 'இருந்து', 'க்கு', 'உடன்'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('நான்', 'நீ', 'அவன்', 'அவள்', 'நாங்கள்', 'அவர்கள்')),
            ('CONJ', ('மற்றும்', 'அல்லது', 'ஆனால்')),
            ('NEG', ('இல்லை',)),
            ('ADV', ('ஆம்', 'மிகவும்', 'நன்றாக')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TeluguAnalyzer(BrahmicScriptAnalyzer):
    """Telugu (te) analyzer layered on the Brahmic script template."""

    LANG_CODE = "te"
    LANG_NAME = "Telugu"
    SCRIPT_NAME = "telugu"

    def _build_base_dictionary(self):
        """Fill the Telugu postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('లో', 'కి', 'నుండి', 'తో'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('నేను', 'నీవు', 'అతను', 'ఆమె', 'మేము', 'వారు')),
            ('CONJ', ('మరియు', 'లేదా', 'కానీ')),
            ('NEG', ('కాదు',)),
            ('ADV', ('అవును', 'చాలా', 'బాగా')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class MarathiAnalyzer(BrahmicScriptAnalyzer):
    """Marathi (mr) analyzer layered on the Brahmic script template."""

    LANG_CODE = "mr"
    LANG_NAME = "Marathi"
    SCRIPT_NAME = "devanagari"

    def _build_base_dictionary(self):
        """Fill the Marathi postposition and function-word tables."""
        self.postpositions = dict.fromkeys(
            ('मध्ये', 'ला', 'पासून', 'साठी', 'बरोबर'), 'PSP'
        )
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('मी', 'तू', 'तो', 'ती', 'आम्ही', 'ते')),
            ('CONJ', ('आणि', 'किंवा', 'पण')),
            ('NEG', ('नाही',)),
            ('ADV', ('हो', 'खूप', 'चांगले')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class GujaratiAnalyzer(BrahmicScriptAnalyzer):
    """Gujarati (gu) analyzer layered on the Brahmic script template."""

    LANG_CODE = "gu"
    LANG_NAME = "Gujarati"
    SCRIPT_NAME = "gujarati"

    def _build_base_dictionary(self):
        """Fill the Gujarati postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('માં', 'ને', 'થી', 'માટે'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('હું', 'તું', 'તે', 'અમે', 'તેઓ')),
            ('CONJ', ('અને', 'અથવા', 'પણ')),
            ('NEG', ('નહીં',)),
            ('ADV', ('હા', 'ખૂબ', 'સારું')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class KannadaAnalyzer(BrahmicScriptAnalyzer):
    """Kannada (kn) analyzer layered on the Brahmic script template."""

    LANG_CODE = "kn"
    LANG_NAME = "Kannada"
    SCRIPT_NAME = "kannada"

    def _build_base_dictionary(self):
        """Fill the Kannada postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('ಲ್ಲಿ', 'ಗೆ', 'ಇಂದ', 'ಜೊತೆ'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('ನಾನು', 'ನೀನು', 'ಅವನು', 'ಅವಳು', 'ನಾವು', 'ಅವರು')),
            ('CONJ', ('ಮತ್ತು', 'ಅಥವಾ', 'ಆದರೆ')),
            ('NEG', ('ಇಲ್ಲ',)),
            ('ADV', ('ಹೌದು', 'ತುಂಬಾ', 'ಚೆನ್ನಾಗಿ')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class MalayalamAnalyzer(BrahmicScriptAnalyzer):
    """Malayalam (ml) analyzer layered on the Brahmic script template."""

    LANG_CODE = "ml"
    LANG_NAME = "Malayalam"
    SCRIPT_NAME = "malayalam"

    def _build_base_dictionary(self):
        """Fill the Malayalam postposition and function-word tables."""
        # NOTE(review): 'ൽ നിന്ന്' contains a space — confirm the template's
        # lookup can match multi-word keys.
        self.postpositions = dict.fromkeys(
            ('ൽ', 'ക്ക്', 'ൽ നിന്ന്', 'കൂടെ'), 'PSP'
        )
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('ഞാൻ', 'നീ', 'അവൻ', 'അവൾ', 'ഞങ്ങൾ', 'അവർ')),
            ('CONJ', ('ഒപ്പം', 'അല്ലെങ്കിൽ', 'പക്ഷേ')),
            ('NEG', ('ഇല്ല',)),
            ('ADV', ('അതെ', 'വളരെ', 'നന്നായി')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class PunjabiAnalyzer(BrahmicScriptAnalyzer):
    """Punjabi (pa) analyzer in Gurmukhi script, built on the Brahmic template."""

    LANG_CODE = "pa"
    LANG_NAME = "Punjabi"
    SCRIPT_NAME = "gurmukhi"

    def _build_base_dictionary(self):
        """Fill the Punjabi postposition and function-word tables."""
        self.postpositions = {'ਵਿੱਚ': 'PSP', 'ਨੂੰ': 'PSP', 'ਤੋਂ': 'PSP', 'ਲਈ': 'PSP', 'ਨਾਲ': 'PSP'}
        self.function_words = {
            # 'ਉਹ' was listed twice in this row; a dict literal keeps only one
            # entry per key, so the redundant duplicate has been dropped.
            'ਮੈਂ': 'PRON', 'ਤੂੰ': 'PRON', 'ਉਹ': 'PRON', 'ਅਸੀਂ': 'PRON',
            'ਅਤੇ': 'CONJ', 'ਜਾਂ': 'CONJ', 'ਪਰ': 'CONJ',
            'ਨਹੀਂ': 'NEG', 'ਹਾਂ': 'ADV', 'ਬਹੁਤ': 'ADV', 'ਚੰਗੀ': 'ADV',
        }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class UrduAnalyzer(ArabicScriptAnalyzer):
    """Urdu (ur) analyzer built on the Arabic script template."""

    LANG_CODE = "ur"
    LANG_NAME = "Urdu"

    def _build_base_dictionary(self):
        """Fill the Urdu prefix, suffix and function-word tables."""
        self.prefixes = {}
        self.suffixes = {'وں': 'PL', 'یں': 'PL'}
        self.function_words = {
            # 'میں' used to appear in both the PRON and PREP rows. A dict
            # literal keeps only the last value for a repeated key, so the
            # PRON entry was dead; only the effective PREP entry is kept.
            # NOTE(review): the word is a homograph — a flat word->tag table
            # cannot carry both readings; confirm PREP is the intended tag.
            'تم': 'PRON', 'وہ': 'PRON', 'ہم': 'PRON', 'آپ': 'PRON',
            'میں': 'PREP', 'پر': 'PREP', 'سے': 'PREP', 'کو': 'PREP', 'کے': 'PREP',
            'اور': 'CONJ', 'یا': 'CONJ', 'لیکن': 'CONJ', 'کہ': 'CONJ',
            'نہیں': 'NEG', 'ہاں': 'ADV', 'بہت': 'ADV', 'اچھا': 'ADV',
        }
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class NepaliAnalyzer(BrahmicScriptAnalyzer):
    """Nepali (ne) analyzer layered on the Brahmic script template."""

    LANG_CODE = "ne"
    LANG_NAME = "Nepali"
    SCRIPT_NAME = "devanagari"

    def _build_base_dictionary(self):
        """Fill the Nepali postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('मा', 'लाई', 'बाट', 'को', 'सँग'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('म', 'तिमी', 'उ', 'हामी', 'उनीहरू')),
            ('CONJ', ('र', 'वा', 'तर')),
            ('NEG', ('होइन',)),
            ('ADV', ('हो', 'धेरै', 'राम्रो')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class SinhalaAnalyzer(BrahmicScriptAnalyzer):
    """Sinhala (si) analyzer layered on the Brahmic script template."""

    LANG_CODE = "si"
    LANG_NAME = "Sinhala"
    SCRIPT_NAME = "sinhala"

    def _build_base_dictionary(self):
        """Fill the Sinhala postposition and function-word tables."""
        self.postpositions = dict.fromkeys(('ට', 'ගෙන්', 'සමග'), 'PSP')
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('මම', 'ඔබ', 'ඔහු', 'ඇය', 'අපි', 'ඔවුන්')),
            ('CONJ', ('සහ', 'හෝ', 'නමුත්')),
            ('NEG', ('නැත',)),
            ('ADV', ('ඔව්', 'ඉතා', 'හොඳ')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# =============================================================================
|
|
176
|
+
# Southeast Asian Languages
|
|
177
|
+
# =============================================================================
|
|
178
|
+
|
|
179
|
+
class MyanmarAnalyzer(BrahmicScriptAnalyzer):
    """Myanmar/Burmese (my) analyzer layered on the Brahmic script template."""

    LANG_CODE = "my"
    LANG_NAME = "Myanmar/Burmese"
    SCRIPT_NAME = "myanmar"

    def _build_base_dictionary(self):
        """Fill the Burmese function-word table (no postpositions are set)."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('ကျွန်တော်', 'သင်', 'သူ', 'ကျွန်တော်တို့', 'သူတို့')),
            ('CONJ', ('နှင့်', 'သို့မဟုတ်', 'သို့သော်')),
            ('NEG', ('မဟုတ်',)),
            ('ADV', ('ဟုတ်', 'အလွန်', 'ကောင်း')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class KhmerAnalyzer(BrahmicScriptAnalyzer):
    """Khmer (km) analyzer layered on the Brahmic script template."""

    LANG_CODE = "km"
    LANG_NAME = "Khmer"
    SCRIPT_NAME = "khmer"

    def _build_base_dictionary(self):
        """Fill the Khmer function-word table (no postpositions are set)."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('ខ្ញុំ', 'អ្នក', 'គាត់', 'យើង', 'ពួកគេ')),
            ('CONJ', ('និង', 'ឬ', 'ប៉ុន្តែ')),
            ('NEG', ('មិន',)),
            ('ADV', ('បាទ', 'ខ្លាំង', 'ល្អ')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class LaoAnalyzer(BrahmicScriptAnalyzer):
    """Lao (lo) analyzer layered on the Brahmic script template."""

    LANG_CODE = "lo"
    LANG_NAME = "Lao"
    SCRIPT_NAME = "lao"

    def _build_base_dictionary(self):
        """Fill the Lao function-word table (no postpositions are set)."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('ຂ້ອຍ', 'ເຈົ້າ', 'ລາວ', 'ພວກເຮົາ', 'ພວກເຂົາ')),
            ('CONJ', ('ແລະ', 'ຫຼື', 'ແຕ່')),
            ('NEG', ('ບໍ່',)),
            ('ADV', ('ແມ່ນ', 'ຫຼາຍ', 'ດີ')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class TagalogAnalyzer(LatinScriptAnalyzer):
    """Tagalog/Filipino (tl) analyzer layered on the Latin script template."""

    LANG_CODE = "tl"
    LANG_NAME = "Tagalog/Filipino"

    def _build_base_dictionary(self):
        """Fill the Tagalog function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('DET', ('ang', 'ng', 'mga')),
            ('PRON', ('ako', 'ikaw', 'siya', 'kami', 'tayo', 'sila')),
            ('PREP', ('sa', 'para', 'mula')),
            ('CONJ', ('at', 'o', 'pero', 'na')),
            ('NEG', ('hindi',)),
            ('ADV', ('oo', 'napaka', 'mabuti')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class MalayAnalyzer(LatinScriptAnalyzer):
    """Malay (ms) analyzer layered on the Latin script template."""

    LANG_CODE = "ms"
    LANG_NAME = "Malay"

    def _build_base_dictionary(self):
        """Fill the Malay function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('DET', ('ini', 'itu', 'tersebut', 'semua', 'setiap')),
            ('PRON', (
                'saya', 'aku', 'awak', 'anda', 'dia', 'ia',
                'kami', 'kita', 'mereka', 'beliau',
            )),
            ('PREP', (
                'di', 'ke', 'dari', 'daripada', 'kepada',
                'pada', 'untuk', 'dengan', 'oleh', 'dalam',
            )),
            ('CONJ', (
                'dan', 'atau', 'tetapi', 'namun', 'kerana',
                'jika', 'kalau', 'apabila', 'bahawa', 'supaya',
            )),
            ('NEG', ('tidak', 'bukan', 'belum')),
            ('AUX', (
                'adalah', 'ialah', 'ada', 'akan', 'sudah',
                'telah', 'sedang', 'masih', 'boleh', 'dapat',
            )),
            ('ADV', ('sangat', 'amat', 'lebih', 'paling', 'juga')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# =============================================================================
|
|
253
|
+
# African Languages
|
|
254
|
+
# =============================================================================
|
|
255
|
+
|
|
256
|
+
class SwahiliAnalyzer(LatinScriptAnalyzer):
    """Swahili (sw) analyzer layered on the Latin script template."""

    LANG_CODE = "sw"
    LANG_NAME = "Swahili"

    def _build_base_dictionary(self):
        """Fill the Swahili function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('mimi', 'wewe', 'yeye', 'sisi', 'ninyi', 'wao')),
            ('CONJ', ('na', 'au', 'lakini')),
            ('PREP', ('kwa',)),
            ('NEG', ('hapana',)),
            ('ADV', ('ndiyo', 'sana', 'vizuri')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class AmharicAnalyzer(EthiopicScriptAnalyzer):
    """Amharic analyzer; adds nothing on top of the Ethiopic script template."""
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class YorubaAnalyzer(LatinScriptAnalyzer):
    """Yoruba (yo) analyzer layered on the Latin script template."""

    LANG_CODE = "yo"
    LANG_NAME = "Yoruba"

    def _build_base_dictionary(self):
        """Fill the Yoruba function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('èmi', 'ìwọ', 'òun', 'àwa', 'wọ́n')),
            ('CONJ', ('àti', 'tàbí', 'ṣùgbọ́n')),
            ('NEG', ('kò',)),
            ('ADV', ('bẹ́ẹ̀ni', 'púpọ̀', 'dáradára')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class HausaAnalyzer(LatinScriptAnalyzer):
    """Hausa (ha) analyzer layered on the Latin script template."""

    LANG_CODE = "ha"
    LANG_NAME = "Hausa"

    def _build_base_dictionary(self):
        """Fill the Hausa function-word table."""
        # NOTE(review): 'da kyau' contains a space — confirm the template's
        # lookup can match multi-word keys.
        groups = (
            ('PRON', ('ni', 'kai', 'shi', 'ita', 'mu', 'su')),
            ('CONJ', ('da', 'ko', 'amma')),
            ('NEG', ('ba',)),
            ('ADV', ('e', 'sosai', 'da kyau')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class ZuluAnalyzer(LatinScriptAnalyzer):
    """Zulu (zu) analyzer layered on the Latin script template."""

    LANG_CODE = "zu"
    LANG_NAME = "Zulu"

    def _build_base_dictionary(self):
        """Fill the Zulu function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('mina', 'wena', 'yena', 'thina', 'bona')),
            ('CONJ', ('na', 'noma', 'kodwa')),
            ('NEG', ('cha',)),
            ('ADV', ('yebo', 'kakhulu', 'kahle')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
class AfrikaansAnalyzer(LatinScriptAnalyzer):
    """Afrikaans (af) analyzer layered on the Latin script template."""

    LANG_CODE = "af"
    LANG_NAME = "Afrikaans"

    def _build_base_dictionary(self):
        """Fill the Afrikaans function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('DET', ('die', "'n")),
            ('PRON', ('ek', 'jy', 'hy', 'sy', 'ons', 'hulle')),
            ('PREP', ('in', 'op', 'van', 'met', 'vir')),
            ('CONJ', ('en', 'of', 'maar', 'dat')),
            ('NEG', ('nie',)),
            ('ADV', ('ja', 'baie', 'goed')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# =============================================================================
|
|
323
|
+
# Caucasian Languages
|
|
324
|
+
# =============================================================================
|
|
325
|
+
|
|
326
|
+
class GeorgianAnalyzer(GeorgianScriptAnalyzer):
    """Georgian analyzer; adds nothing on top of the Georgian script template."""
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
class ArmenianAnalyzer(LatinScriptAnalyzer):
    """Armenian (hy) analyzer with Latin transliteration support."""

    LANG_CODE = "hy"
    LANG_NAME = "Armenian"

    def _build_base_dictionary(self):
        """Fill the function-word table with Latin-transliterated Armenian."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('yes', 'duk', 'na', 'menk', 'nrank')),
            ('CONJ', ('yev', 'kam', 'bayc')),
            ('NEG', ('che',)),
            ('ADV', ('ayo', 'shat', 'lav')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
class AzerbaijaniAnalyzer(LatinScriptAnalyzer):
    """Azerbaijani (az) analyzer layered on the Latin script template."""

    LANG_CODE = "az"
    LANG_NAME = "Azerbaijani"

    def _build_base_dictionary(self):
        """Fill the Azerbaijani function-word table."""
        # NOTE(review): 'və ya' contains a space — confirm the template's
        # lookup can match multi-word keys.
        groups = (
            ('PRON', ('mən', 'sən', 'o', 'biz', 'siz', 'onlar')),
            ('CONJ', ('və', 'və ya', 'amma', 'lakin')),
            ('NEG', ('yox',)),
            ('ADV', ('bəli', 'çox', 'yaxşı')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
# =============================================================================
|
|
356
|
+
# Central Asian Languages
|
|
357
|
+
# =============================================================================
|
|
358
|
+
|
|
359
|
+
class KazakhAnalyzer(CyrillicScriptAnalyzer):
    """Kazakh (kk) analyzer layered on the Cyrillic script template."""

    LANG_CODE = "kk"
    LANG_NAME = "Kazakh"

    def _build_base_dictionary(self):
        """Fill the Kazakh function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('мен', 'сен', 'ол', 'біз', 'сіз', 'олар')),
            ('CONJ', ('және', 'немесе', 'бірақ')),
            ('NEG', ('жоқ',)),
            ('ADV', ('иә', 'өте', 'жақсы')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
class UzbekAnalyzer(LatinScriptAnalyzer):
    """Uzbek (uz) analyzer layered on the Latin script template."""

    LANG_CODE = "uz"
    LANG_NAME = "Uzbek"

    def _build_base_dictionary(self):
        """Fill the Uzbek function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('men', 'sen', 'u', 'biz', 'siz', 'ular')),
            ('CONJ', ('va', 'yoki', 'lekin', 'ammo')),
            ('NEG', ("yo'q",)),
            ('ADV', ('ha', 'juda', 'yaxshi')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
class MongolianAnalyzer(CyrillicScriptAnalyzer):
    """Mongolian (mn) analyzer layered on the Cyrillic script template."""

    LANG_CODE = "mn"
    LANG_NAME = "Mongolian"

    def _build_base_dictionary(self):
        """Fill the Mongolian function-word table."""
        # Words grouped by tag; flattened below in the same order.
        groups = (
            ('PRON', ('би', 'чи', 'тэр', 'бид', 'та', 'тэд')),
            ('CONJ', ('ба', 'эсвэл', 'гэхдээ')),
            ('NEG', ('үгүй',)),
            ('ADV', ('тийм', 'маш', 'сайн')),
        )
        self.function_words = {
            word: tag for tag, words in groups for word in words
        }
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
# Export all
|
|
396
|
+
# Public names of this module, grouped by region in definition order.
__all__ = [
    # South Asian
    'BengaliAnalyzer',
    'TamilAnalyzer',
    'TeluguAnalyzer',
    'MarathiAnalyzer',
    'GujaratiAnalyzer',
    'KannadaAnalyzer',
    'MalayalamAnalyzer',
    'PunjabiAnalyzer',
    'UrduAnalyzer',
    'NepaliAnalyzer',
    'SinhalaAnalyzer',
    # Southeast Asian
    'MyanmarAnalyzer',
    'KhmerAnalyzer',
    'LaoAnalyzer',
    'TagalogAnalyzer',
    'MalayAnalyzer',
    # African
    'SwahiliAnalyzer',
    'AmharicAnalyzer',
    'YorubaAnalyzer',
    'HausaAnalyzer',
    'ZuluAnalyzer',
    'AfrikaansAnalyzer',
    # Caucasian
    'GeorgianAnalyzer',
    'ArmenianAnalyzer',
    'AzerbaijaniAnalyzer',
    # Central Asian
    'KazakhAnalyzer',
    'UzbekAnalyzer',
    'MongolianAnalyzer',
]
|