tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,410 @@
1
+ """
2
+ Tier 4 Languages - Extended Global Coverage
3
+ ============================================
4
+
5
+ 25+ languages: South Asian, Southeast Asian, African, Caucasian, Central Asian, etc.
6
+ bn, ta, te, mr, gu, kn, ml, pa, ur, ne, si, my, km, lo, tl, ms, sw, am, yo, ha, zu, af, ka, hy, az, kk, uz, mn
7
+ """
8
+
9
+ from .templates.latin_template import LatinScriptAnalyzer
10
+ from .templates.cyrillic_template import CyrillicScriptAnalyzer
11
+ from .templates.arabic_script_template import ArabicScriptAnalyzer
12
+ from .templates.brahmic_template import BrahmicScriptAnalyzer
13
+ from .templates.other_scripts_template import GeorgianScriptAnalyzer, EthiopicScriptAnalyzer
14
+
15
+
16
+ # =============================================================================
17
+ # South Asian Languages
18
+ # =============================================================================
19
+
20
class BengaliAnalyzer(BrahmicScriptAnalyzer):
    """Bengali (``bn``) analyzer layered on ``BrahmicScriptAnalyzer``.

    Supplies only language metadata and two small lookup tables; all other
    behaviour is inherited from the template.
    """

    LANG_CODE = "bn"
    LANG_NAME = "Bengali"
    SCRIPT_NAME = "bengali"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both tables map a surface form to a tag string.
        NOTE(review): the call site is not visible here; presumably the
        template invokes this hook during set-up.
        """
        self.postpositions = dict.fromkeys(['এ', 'তে', 'কে', 'র', 'থেকে'], 'PSP')

        tagged_groups = [
            ('PRON', ['আমি', 'তুমি', 'সে', 'আমরা', 'তোমরা', 'তারা']),
            ('CONJ', ['এবং', 'বা', 'কিন্তু', 'যে']),
            ('NEG', ['না']),
            ('ADV', ['হ্যাঁ', 'খুব', 'ভালো']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
32
+
33
+
34
class TamilAnalyzer(BrahmicScriptAnalyzer):
    """Tamil (``ta``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "ta"
    LANG_NAME = "Tamil"
    SCRIPT_NAME = "tamil"

    def _build_base_dictionary(self):
        """Populate the postposition and function-word tables.

        Each table maps a surface form to its tag string.
        """
        self.postpositions = dict.fromkeys(['இல்', 'இருந்து', 'க்கு', 'உடன்'], 'PSP')

        groups = [
            ('PRON', ['நான்', 'நீ', 'அவன்', 'அவள்', 'நாங்கள்', 'அவர்கள்']),
            ('CONJ', ['மற்றும்', 'அல்லது', 'ஆனால்']),
            ('NEG', ['இல்லை']),
            ('ADV', ['ஆம்', 'மிகவும்', 'நன்றாக']),
        ]
        # Flatten tag -> forms into form -> tag, keeping the listed order.
        self.function_words = {form: tag for tag, forms in groups for form in forms}
46
+
47
+
48
class TeluguAnalyzer(BrahmicScriptAnalyzer):
    """Telugu (``te``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "te"
    LANG_NAME = "Telugu"
    SCRIPT_NAME = "telugu"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both are plain dicts from surface form to tag string.
        """
        self.postpositions = dict.fromkeys(['లో', 'కి', 'నుండి', 'తో'], 'PSP')
        self.function_words = {
            **dict.fromkeys(['నేను', 'నీవు', 'అతను', 'ఆమె', 'మేము', 'వారు'], 'PRON'),
            **dict.fromkeys(['మరియు', 'లేదా', 'కానీ'], 'CONJ'),
            'కాదు': 'NEG',
            **dict.fromkeys(['అవును', 'చాలా', 'బాగా'], 'ADV'),
        }
60
+
61
+
62
class MarathiAnalyzer(BrahmicScriptAnalyzer):
    """Marathi (``mr``) analyzer; declares the Devanagari script name."""

    LANG_CODE = "mr"
    LANG_NAME = "Marathi"
    SCRIPT_NAME = "devanagari"

    def _build_base_dictionary(self):
        """Populate the postposition and function-word tables (form -> tag)."""
        self.postpositions = dict.fromkeys(
            ['मध्ये', 'ला', 'पासून', 'साठी', 'बरोबर'], 'PSP'
        )

        table = dict.fromkeys(['मी', 'तू', 'तो', 'ती', 'आम्ही', 'ते'], 'PRON')
        table.update(dict.fromkeys(['आणि', 'किंवा', 'पण'], 'CONJ'))
        table['नाही'] = 'NEG'
        table.update(dict.fromkeys(['हो', 'खूप', 'चांगले'], 'ADV'))
        self.function_words = table
74
+
75
+
76
class GujaratiAnalyzer(BrahmicScriptAnalyzer):
    """Gujarati (``gu``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "gu"
    LANG_NAME = "Gujarati"
    SCRIPT_NAME = "gujarati"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both tables map a surface form to a tag string.
        """
        self.postpositions = dict.fromkeys(['માં', 'ને', 'થી', 'માટે'], 'PSP')

        tagged_groups = [
            ('PRON', ['હું', 'તું', 'તે', 'અમે', 'તેઓ']),
            ('CONJ', ['અને', 'અથવા', 'પણ']),
            ('NEG', ['નહીં']),
            ('ADV', ['હા', 'ખૂબ', 'સારું']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
88
+
89
+
90
class KannadaAnalyzer(BrahmicScriptAnalyzer):
    """Kannada (``kn``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "kn"
    LANG_NAME = "Kannada"
    SCRIPT_NAME = "kannada"

    def _build_base_dictionary(self):
        """Populate the postposition and function-word tables (form -> tag)."""
        self.postpositions = dict.fromkeys(['ಲ್ಲಿ', 'ಗೆ', 'ಇಂದ', 'ಜೊತೆ'], 'PSP')

        groups = [
            ('PRON', ['ನಾನು', 'ನೀನು', 'ಅವನು', 'ಅವಳು', 'ನಾವು', 'ಅವರು']),
            ('CONJ', ['ಮತ್ತು', 'ಅಥವಾ', 'ಆದರೆ']),
            ('NEG', ['ಇಲ್ಲ']),
            ('ADV', ['ಹೌದು', 'ತುಂಬಾ', 'ಚೆನ್ನಾಗಿ']),
        ]
        # Invert tag -> forms into form -> tag, keeping the listed order.
        self.function_words = {form: tag for tag, forms in groups for form in forms}
102
+
103
+
104
class MalayalamAnalyzer(BrahmicScriptAnalyzer):
    """Malayalam (``ml``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "ml"
    LANG_NAME = "Malayalam"
    SCRIPT_NAME = "malayalam"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both are dicts from surface form to tag string.
        """
        # NOTE(review): 'ൽ നിന്ന്' contains a space; confirm the template can
        # actually match a multi-token key.
        self.postpositions = dict.fromkeys(['ൽ', 'ക്ക്', 'ൽ നിന്ന്', 'കൂടെ'], 'PSP')
        self.function_words = {
            **dict.fromkeys(['ഞാൻ', 'നീ', 'അവൻ', 'അവൾ', 'ഞങ്ങൾ', 'അവർ'], 'PRON'),
            **dict.fromkeys(['ഒപ്പം', 'അല്ലെങ്കിൽ', 'പക്ഷേ'], 'CONJ'),
            'ഇല്ല': 'NEG',
            **dict.fromkeys(['അതെ', 'വളരെ', 'നന്നായി'], 'ADV'),
        }
116
+
117
+
118
class PunjabiAnalyzer(BrahmicScriptAnalyzer):
    """Punjabi (``pa``) analyzer; declares the Gurmukhi script name.

    Supplies only language metadata and two lookup tables; all other
    behaviour is inherited from ``BrahmicScriptAnalyzer``.
    """

    LANG_CODE = "pa"
    LANG_NAME = "Punjabi"
    SCRIPT_NAME = "gurmukhi"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both tables map a surface form to a tag string.
        """
        self.postpositions = {'ਵਿੱਚ': 'PSP', 'ਨੂੰ': 'PSP', 'ਤੋਂ': 'PSP', 'ਲਈ': 'PSP', 'ਨਾਲ': 'PSP'}
        self.function_words = {
            # 'ਉਹ' was listed twice in the pronoun row; a dict literal keeps
            # only one entry per key, so the repeat was dead and is removed.
            'ਮੈਂ': 'PRON', 'ਤੂੰ': 'PRON', 'ਉਹ': 'PRON', 'ਅਸੀਂ': 'PRON',
            'ਅਤੇ': 'CONJ', 'ਜਾਂ': 'CONJ', 'ਪਰ': 'CONJ',
            'ਨਹੀਂ': 'NEG', 'ਹਾਂ': 'ADV', 'ਬਹੁਤ': 'ADV', 'ਚੰਗੀ': 'ADV',
        }
130
+
131
+
132
class UrduAnalyzer(ArabicScriptAnalyzer):
    """Urdu (``ur``) analyzer layered on ``ArabicScriptAnalyzer``.

    Supplies language metadata plus prefix, suffix and function-word tables;
    all other behaviour is inherited from the template.
    """

    LANG_CODE = "ur"
    LANG_NAME = "Urdu"

    def _build_base_dictionary(self):
        """Populate ``self.prefixes``, ``self.suffixes`` and ``self.function_words``.

        ``prefixes`` is deliberately empty. ``suffixes`` and ``function_words``
        map a surface form to a tag string.
        """
        self.prefixes = {}
        self.suffixes = {'وں': 'PL', 'یں': 'PL'}
        self.function_words = {
            # 'میں' was listed in both the pronoun row and the preposition row.
            # A dict keeps one value per key and the later entry wins, so the
            # effective tag has always been 'PREP'. The dead 'PRON' entry is
            # removed and the key stays first so contents and order match.
            # NOTE(review): if both readings are needed, the table must carry
            # more than one tag per form; confirm against the template.
            'میں': 'PREP',
            'تم': 'PRON', 'وہ': 'PRON', 'ہم': 'PRON', 'آپ': 'PRON',
            'پر': 'PREP', 'سے': 'PREP', 'کو': 'PREP', 'کے': 'PREP',
            'اور': 'CONJ', 'یا': 'CONJ', 'لیکن': 'CONJ', 'کہ': 'CONJ',
            'نہیں': 'NEG', 'ہاں': 'ADV', 'بہت': 'ADV', 'اچھا': 'ADV',
        }
145
+
146
+
147
class NepaliAnalyzer(BrahmicScriptAnalyzer):
    """Nepali (``ne``) analyzer; declares the Devanagari script name."""

    LANG_CODE = "ne"
    LANG_NAME = "Nepali"
    SCRIPT_NAME = "devanagari"

    def _build_base_dictionary(self):
        """Populate the postposition and function-word tables (form -> tag)."""
        self.postpositions = dict.fromkeys(['मा', 'लाई', 'बाट', 'को', 'सँग'], 'PSP')

        table = dict.fromkeys(['म', 'तिमी', 'उ', 'हामी', 'उनीहरू'], 'PRON')
        table.update(dict.fromkeys(['र', 'वा', 'तर'], 'CONJ'))
        table['होइन'] = 'NEG'
        table.update(dict.fromkeys(['हो', 'धेरै', 'राम्रो'], 'ADV'))
        self.function_words = table
159
+
160
+
161
class SinhalaAnalyzer(BrahmicScriptAnalyzer):
    """Sinhala (``si``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "si"
    LANG_NAME = "Sinhala"
    SCRIPT_NAME = "sinhala"

    def _build_base_dictionary(self):
        """Populate ``self.postpositions`` and ``self.function_words``.

        Both tables map a surface form to a tag string.
        """
        self.postpositions = dict.fromkeys(['ට', 'ගෙන්', 'සමග'], 'PSP')

        tagged_groups = [
            ('PRON', ['මම', 'ඔබ', 'ඔහු', 'ඇය', 'අපි', 'ඔවුන්']),
            ('CONJ', ['සහ', 'හෝ', 'නමුත්']),
            ('NEG', ['නැත']),
            ('ADV', ['ඔව්', 'ඉතා', 'හොඳ']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
173
+
174
+
175
+ # =============================================================================
176
+ # Southeast Asian Languages
177
+ # =============================================================================
178
+
179
class MyanmarAnalyzer(BrahmicScriptAnalyzer):
    """Myanmar/Burmese (``my``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "my"
    LANG_NAME = "Myanmar/Burmese"
    SCRIPT_NAME = "myanmar"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` (surface form -> tag).

        No postposition table is set for this language.
        """
        groups = [
            ('PRON', ['ကျွန်တော်', 'သင်', 'သူ', 'ကျွန်တော်တို့', 'သူတို့']),
            ('CONJ', ['နှင့်', 'သို့မဟုတ်', 'သို့သော်']),
            ('NEG', ['မဟုတ်']),
            ('ADV', ['ဟုတ်', 'အလွန်', 'ကောင်း']),
        ]
        self.function_words = {form: tag for tag, forms in groups for form in forms}
190
+
191
+
192
class KhmerAnalyzer(BrahmicScriptAnalyzer):
    """Khmer (``km``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "km"
    LANG_NAME = "Khmer"
    SCRIPT_NAME = "khmer"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` (surface form -> tag).

        No postposition table is set for this language.
        """
        self.function_words = {
            **dict.fromkeys(['ខ្ញុំ', 'អ្នក', 'គាត់', 'យើង', 'ពួកគេ'], 'PRON'),
            **dict.fromkeys(['និង', 'ឬ', 'ប៉ុន្តែ'], 'CONJ'),
            'មិន': 'NEG',
            **dict.fromkeys(['បាទ', 'ខ្លាំង', 'ល្អ'], 'ADV'),
        }
203
+
204
+
205
class LaoAnalyzer(BrahmicScriptAnalyzer):
    """Lao (``lo``) analyzer layered on ``BrahmicScriptAnalyzer``."""

    LANG_CODE = "lo"
    LANG_NAME = "Lao"
    SCRIPT_NAME = "lao"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` (surface form -> tag).

        No postposition table is set for this language.
        """
        table = dict.fromkeys(['ຂ້ອຍ', 'ເຈົ້າ', 'ລາວ', 'ພວກເຮົາ', 'ພວກເຂົາ'], 'PRON')
        table.update(dict.fromkeys(['ແລະ', 'ຫຼື', 'ແຕ່'], 'CONJ'))
        table['ບໍ່'] = 'NEG'
        table.update(dict.fromkeys(['ແມ່ນ', 'ຫຼາຍ', 'ດີ'], 'ADV'))
        self.function_words = table
216
+
217
+
218
class TagalogAnalyzer(LatinScriptAnalyzer):
    """Tagalog/Filipino (``tl``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "tl"
    LANG_NAME = "Tagalog/Filipino"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        tagged_groups = [
            ('DET', ['ang', 'ng', 'mga']),
            ('PRON', ['ako', 'ikaw', 'siya', 'kami', 'tayo', 'sila']),
            ('PREP', ['sa', 'para', 'mula']),
            ('CONJ', ['at', 'o', 'pero', 'na']),
            ('NEG', ['hindi']),
            ('ADV', ['oo', 'napaka', 'mabuti']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
230
+
231
+
232
class MalayAnalyzer(LatinScriptAnalyzer):
    """Malay (``ms``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "ms"
    LANG_NAME = "Malay"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        groups = [
            ('DET', ['ini', 'itu', 'tersebut', 'semua', 'setiap']),
            ('PRON', [
                'saya', 'aku', 'awak', 'anda', 'dia', 'ia',
                'kami', 'kita', 'mereka', 'beliau',
            ]),
            ('PREP', [
                'di', 'ke', 'dari', 'daripada', 'kepada',
                'pada', 'untuk', 'dengan', 'oleh', 'dalam',
            ]),
            ('CONJ', [
                'dan', 'atau', 'tetapi', 'namun', 'kerana',
                'jika', 'kalau', 'apabila', 'bahawa', 'supaya',
            ]),
            ('NEG', ['tidak', 'bukan', 'belum']),
            ('AUX', [
                'adalah', 'ialah', 'ada', 'akan', 'sudah',
                'telah', 'sedang', 'masih', 'boleh', 'dapat',
            ]),
            ('ADV', ['sangat', 'amat', 'lebih', 'paling', 'juga']),
        ]
        # Invert tag -> forms into form -> tag, keeping the listed order.
        self.function_words = {form: tag for tag, forms in groups for form in forms}
250
+
251
+
252
+ # =============================================================================
253
+ # African Languages
254
+ # =============================================================================
255
+
256
class SwahiliAnalyzer(LatinScriptAnalyzer):
    """Swahili (``sw``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "sw"
    LANG_NAME = "Swahili"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        self.function_words = {
            **dict.fromkeys(['mimi', 'wewe', 'yeye', 'sisi', 'ninyi', 'wao'], 'PRON'),
            **dict.fromkeys(['na', 'au', 'lakini'], 'CONJ'),
            'kwa': 'PREP',
            'hapana': 'NEG',
            **dict.fromkeys(['ndiyo', 'sana', 'vizuri'], 'ADV'),
        }
266
+
267
+
268
class AmharicAnalyzer(EthiopicScriptAnalyzer):
    """Amharic analyzer that adds nothing to ``EthiopicScriptAnalyzer``.

    The class exists only to give the template a language-specific name.
    NOTE(review): language code, name and dictionaries presumably come from
    the template; confirm in ``other_scripts_template``.
    """
270
+
271
+
272
class YorubaAnalyzer(LatinScriptAnalyzer):
    """Yoruba (``yo``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "yo"
    LANG_NAME = "Yoruba"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        table = dict.fromkeys(['èmi', 'ìwọ', 'òun', 'àwa', 'wọ́n'], 'PRON')
        table.update(dict.fromkeys(['àti', 'tàbí', 'ṣùgbọ́n'], 'CONJ'))
        table['kò'] = 'NEG'
        table.update(dict.fromkeys(['bẹ́ẹ̀ni', 'púpọ̀', 'dáradára'], 'ADV'))
        self.function_words = table
282
+
283
+
284
class HausaAnalyzer(LatinScriptAnalyzer):
    """Hausa (``ha``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "ha"
    LANG_NAME = "Hausa"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        tagged_groups = [
            ('PRON', ['ni', 'kai', 'shi', 'ita', 'mu', 'su']),
            ('CONJ', ['da', 'ko', 'amma']),
            ('NEG', ['ba']),
            # NOTE(review): 'da kyau' contains a space; confirm the template
            # can match a multi-token key.
            ('ADV', ['e', 'sosai', 'da kyau']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
294
+
295
+
296
class ZuluAnalyzer(LatinScriptAnalyzer):
    """Zulu (``zu``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "zu"
    LANG_NAME = "Zulu"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        groups = [
            ('PRON', ['mina', 'wena', 'yena', 'thina', 'bona']),
            ('CONJ', ['na', 'noma', 'kodwa']),
            ('NEG', ['cha']),
            ('ADV', ['yebo', 'kakhulu', 'kahle']),
        ]
        self.function_words = {form: tag for tag, forms in groups for form in forms}
306
+
307
+
308
class AfrikaansAnalyzer(LatinScriptAnalyzer):
    """Afrikaans (``af``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "af"
    LANG_NAME = "Afrikaans"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        self.function_words = {
            # "'n" is the apostrophe-initial form listed by the original table.
            **dict.fromkeys(['die', "'n"], 'DET'),
            **dict.fromkeys(['ek', 'jy', 'hy', 'sy', 'ons', 'hulle'], 'PRON'),
            **dict.fromkeys(['in', 'op', 'van', 'met', 'vir'], 'PREP'),
            **dict.fromkeys(['en', 'of', 'maar', 'dat'], 'CONJ'),
            'nie': 'NEG',
            **dict.fromkeys(['ja', 'baie', 'goed'], 'ADV'),
        }
320
+
321
+
322
+ # =============================================================================
323
+ # Caucasian Languages
324
+ # =============================================================================
325
+
326
class GeorgianAnalyzer(GeorgianScriptAnalyzer):
    """Georgian analyzer that adds nothing to ``GeorgianScriptAnalyzer``.

    The class exists only to give the template a language-specific name.
    NOTE(review): language code, name and dictionaries presumably come from
    the template; confirm in ``other_scripts_template``.
    """
328
+
329
+
330
class ArmenianAnalyzer(LatinScriptAnalyzer):
    """Armenian (``hy``) analyzer for Latin-transliterated text.

    Built on ``LatinScriptAnalyzer``; every function word listed here is a
    Latin-letter form, not Armenian script.
    """

    LANG_CODE = "hy"
    LANG_NAME = "Armenian"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        table = dict.fromkeys(['yes', 'duk', 'na', 'menk', 'nrank'], 'PRON')
        table.update(dict.fromkeys(['yev', 'kam', 'bayc'], 'CONJ'))
        table['che'] = 'NEG'
        table.update(dict.fromkeys(['ayo', 'shat', 'lav'], 'ADV'))
        self.function_words = table
341
+
342
+
343
class AzerbaijaniAnalyzer(LatinScriptAnalyzer):
    """Azerbaijani (``az``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "az"
    LANG_NAME = "Azerbaijani"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        tagged_groups = [
            ('PRON', ['mən', 'sən', 'o', 'biz', 'siz', 'onlar']),
            # NOTE(review): 'və ya' contains a space; confirm the template can
            # match a multi-token key.
            ('CONJ', ['və', 'və ya', 'amma', 'lakin']),
            ('NEG', ['yox']),
            ('ADV', ['bəli', 'çox', 'yaxşı']),
        ]
        lexicon = {}
        for tag, forms in tagged_groups:
            for form in forms:
                lexicon[form] = tag
        self.function_words = lexicon
353
+
354
+
355
+ # =============================================================================
356
+ # Central Asian Languages
357
+ # =============================================================================
358
+
359
class KazakhAnalyzer(CyrillicScriptAnalyzer):
    """Kazakh (``kk``) analyzer layered on ``CyrillicScriptAnalyzer``."""

    LANG_CODE = "kk"
    LANG_NAME = "Kazakh"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        groups = [
            ('PRON', ['мен', 'сен', 'ол', 'біз', 'сіз', 'олар']),
            ('CONJ', ['және', 'немесе', 'бірақ']),
            ('NEG', ['жоқ']),
            ('ADV', ['иә', 'өте', 'жақсы']),
        ]
        self.function_words = {form: tag for tag, forms in groups for form in forms}
369
+
370
+
371
class UzbekAnalyzer(LatinScriptAnalyzer):
    """Uzbek (``uz``) analyzer layered on ``LatinScriptAnalyzer``."""

    LANG_CODE = "uz"
    LANG_NAME = "Uzbek"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        self.function_words = {
            **dict.fromkeys(['men', 'sen', 'u', 'biz', 'siz', 'ular'], 'PRON'),
            **dict.fromkeys(['va', 'yoki', 'lekin', 'ammo'], 'CONJ'),
            # The key uses a plain ASCII apostrophe, as in the original table.
            "yo'q": 'NEG',
            **dict.fromkeys(['ha', 'juda', 'yaxshi'], 'ADV'),
        }
381
+
382
+
383
class MongolianAnalyzer(CyrillicScriptAnalyzer):
    """Mongolian (``mn``) analyzer layered on ``CyrillicScriptAnalyzer``."""

    LANG_CODE = "mn"
    LANG_NAME = "Mongolian"

    def _build_base_dictionary(self):
        """Populate ``self.function_words`` with surface form -> tag entries."""
        table = dict.fromkeys(['би', 'чи', 'тэр', 'бид', 'та', 'тэд'], 'PRON')
        table.update(dict.fromkeys(['ба', 'эсвэл', 'гэхдээ'], 'CONJ'))
        table['үгүй'] = 'NEG'
        table.update(dict.fromkeys(['тийм', 'маш', 'сайн'], 'ADV'))
        self.function_words = table
393
+
394
+
395
+ # Export all
396
# Public names of this module, grouped by region in definition order.
# Kept as a plain list literal so static tools can read it.
__all__ = [
    # South Asian
    'BengaliAnalyzer',
    'TamilAnalyzer',
    'TeluguAnalyzer',
    'MarathiAnalyzer',
    'GujaratiAnalyzer',
    'KannadaAnalyzer',
    'MalayalamAnalyzer',
    'PunjabiAnalyzer',
    'UrduAnalyzer',
    'NepaliAnalyzer',
    'SinhalaAnalyzer',
    # Southeast Asian
    'MyanmarAnalyzer',
    'KhmerAnalyzer',
    'LaoAnalyzer',
    'TagalogAnalyzer',
    'MalayAnalyzer',
    # African
    'SwahiliAnalyzer',
    'AmharicAnalyzer',
    'YorubaAnalyzer',
    'HausaAnalyzer',
    'ZuluAnalyzer',
    'AfrikaansAnalyzer',
    # Caucasian
    'GeorgianAnalyzer',
    'ArmenianAnalyzer',
    'AzerbaijaniAnalyzer',
    # Central Asian
    'KazakhAnalyzer',
    'UzbekAnalyzer',
    'MongolianAnalyzer',
]