PyPI - tokmor - Versions diffs - 1.2.9__py3-none-any.whl - Mend

tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

tokmor/__init__.py +77 -0
tokmor/api.py +194 -0
tokmor/assets.py +365 -0
tokmor/base.py +238 -0
tokmor/brahmic.py +516 -0
tokmor/cjk.py +497 -0
tokmor/domain/__init__.py +11 -0
tokmor/domain/sentiment.py +198 -0
tokmor/factory.py +394 -0
tokmor/indic.py +289 -0
tokmor/inventory.py +51 -0
tokmor/legacy_api.py +143 -0
tokmor/lemma_store.py +102 -0
tokmor/lookup_keys.py +145 -0
tokmor/models/domain/sentiment/en.json +54 -0
tokmor/models/domain/sentiment/ko.json +52 -0
tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
tokmor/morphology/__init__.py +395 -0
tokmor/morphology/advanced_base.py +472 -0
tokmor/morphology/arabic_advanced.py +247 -0
tokmor/morphology/chinese.py +736 -0
tokmor/morphology/chinese_advanced.py +425 -0
tokmor/morphology/english.py +315 -0
tokmor/morphology/english_advanced.py +560 -0
tokmor/morphology/french_advanced.py +237 -0
tokmor/morphology/german_advanced.py +343 -0
tokmor/morphology/hindi_advanced.py +258 -0
tokmor/morphology/japanese.py +417 -0
tokmor/morphology/japanese_advanced.py +589 -0
tokmor/morphology/korean.py +534 -0
tokmor/morphology/korean_advanced.py +603 -0
tokmor/morphology/russian_advanced.py +217 -0
tokmor/morphology/spanish_advanced.py +226 -0
tokmor/morphology/templates/__init__.py +32 -0
tokmor/morphology/templates/arabic_script_template.py +162 -0
tokmor/morphology/templates/brahmic_template.py +181 -0
tokmor/morphology/templates/cyrillic_template.py +168 -0
tokmor/morphology/templates/latin_template.py +235 -0
tokmor/morphology/templates/other_scripts_template.py +475 -0
tokmor/morphology/thai_native.py +274 -0
tokmor/morphology/tier2.py +477 -0
tokmor/morphology/tier3.py +449 -0
tokmor/morphology/tier4.py +410 -0
tokmor/morphology/unified.py +855 -0
tokmor/morphology/universal_fallback.py +398 -0
tokmor/ner_prep.py +747 -0
tokmor/offline.py +89 -0
tokmor/preprocess.py +80 -0
tokmor/resources.py +288 -0
tokmor/routing.py +147 -0
tokmor/rtl.py +309 -0
tokmor/schema.py +17 -0
tokmor/sns_tags.py +281 -0
tokmor/space_based.py +272 -0
tokmor/token_quality.py +1185 -0
tokmor/unified_tokens.py +228 -0
tokmor-1.2.9.dist-info/METADATA +103 -0
tokmor-1.2.9.dist-info/RECORD +70 -0
tokmor-1.2.9.dist-info/WHEEL +5 -0
tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
tokmor-1.2.9.dist-info/top_level.txt +1 -0

tokmor/morphology/tier2.py ADDED Viewed

@@ -0,0 +1,477 @@
+"""
+Tier 2 Languages - Major Regional Languages
+============================================
+15 languages: pt, it, nl, pl, tr, vi, th, id, he, fa, uk, el, cs, ro, sv
+"""
+from .templates.latin_template import LatinScriptAnalyzer
+from .templates.cyrillic_template import CyrillicScriptAnalyzer
+from .templates.arabic_script_template import ArabicScriptAnalyzer
+from .templates.other_scripts_template import HebrewScriptAnalyzer, GreekScriptAnalyzer, ThaiScriptAnalyzer
+# =============================================================================
+# Portuguese (pt)
+# =============================================================================
+class PortugueseAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "pt"
+    LANG_NAME = "Portuguese"
+    VERB_INFINITIVE_SUFFIXES = ['ar', 'er', 'ir']
+    VERB_PARTICIPLE_SUFFIXES = ['ado', 'ido', 'ando', 'endo', 'indo']
+    NOUN_PLURAL_SUFFIXES = ['s', 'es', 'ões', 'ães', 'ais', 'éis', 'óis']
+    ADJECTIVE_SUFFIXES = ['oso', 'osa', 'ivo', 'iva', 'al', 'el', 'il']
+    ADVERB_SUFFIXES = ['mente']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'o': 'DET', 'a': 'DET', 'os': 'DET', 'as': 'DET', 'um': 'DET', 'uma': 'DET',
+            'eu': 'PRON', 'tu': 'PRON', 'ele': 'PRON', 'ela': 'PRON', 'nós': 'PRON', 'eles': 'PRON',
+            'de': 'PREP', 'em': 'PREP', 'para': 'PREP', 'com': 'PREP', 'por': 'PREP',
+            'e': 'CONJ', 'ou': 'CONJ', 'mas': 'CONJ', 'que': 'CONJ',
+            'não': 'NEG', 'muito': 'ADV', 'bem': 'ADV', 'mais': 'ADV',
+        }
+        self.irregular_verbs = {
+            'sou': 'ser', 'é': 'ser', 'são': 'ser', 'foi': 'ser', 'eram': 'ser',
+            'tenho': 'ter', 'tem': 'ter', 'tinha': 'ter', 'teve': 'ter',
+            'vou': 'ir', 'vai': 'ir', 'vamos': 'ir', 'foi': 'ir',
+            'faço': 'fazer', 'faz': 'fazer', 'fez': 'fazer',
+            'estou': 'estar', 'está': 'estar', 'estava': 'estar',
+        }
+# =============================================================================
+# Italian (it)
+# =============================================================================
+class ItalianAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "it"
+    LANG_NAME = "Italian"
+    VERB_INFINITIVE_SUFFIXES = ['are', 'ere', 'ire']
+    VERB_PARTICIPLE_SUFFIXES = ['ato', 'ito', 'uto', 'ando', 'endo']
+    NOUN_PLURAL_SUFFIXES = ['i', 'e', 'a']
+    ADJECTIVE_SUFFIXES = ['oso', 'osa', 'ale', 'ile', 'ivo', 'iva']
+    ADVERB_SUFFIXES = ['mente']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'il': 'DET', 'la': 'DET', 'lo': 'DET', 'i': 'DET', 'le': 'DET', 'gli': 'DET',
+            'un': 'DET', 'una': 'DET', 'uno': 'DET',
+            'io': 'PRON', 'tu': 'PRON', 'lui': 'PRON', 'lei': 'PRON', 'noi': 'PRON', 'loro': 'PRON',
+            'di': 'PREP', 'a': 'PREP', 'da': 'PREP', 'in': 'PREP', 'con': 'PREP', 'su': 'PREP', 'per': 'PREP',
+            'e': 'CONJ', 'o': 'CONJ', 'ma': 'CONJ', 'che': 'CONJ',
+            'non': 'NEG', 'molto': 'ADV', 'bene': 'ADV', 'più': 'ADV',
+        }
+        self.irregular_verbs = {
+            'sono': 'essere', 'è': 'essere', 'siamo': 'essere', 'era': 'essere', 'stato': 'essere',
+            'ho': 'avere', 'ha': 'avere', 'abbiamo': 'avere', 'aveva': 'avere', 'avuto': 'avere',
+            'vado': 'andare', 'va': 'andare', 'andiamo': 'andare', 'andato': 'andare',
+            'faccio': 'fare', 'fa': 'fare', 'fatto': 'fare',
+        }
+# =============================================================================
+# Dutch (nl)
+# =============================================================================
+class DutchAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "nl"
+    LANG_NAME = "Dutch"
+    VERB_INFINITIVE_SUFFIXES = ['en']
+    VERB_PARTICIPLE_SUFFIXES = ['t', 'd', 'de', 'te', 'end']
+    NOUN_PLURAL_SUFFIXES = ['en', 's', 'eren']
+    ADJECTIVE_SUFFIXES = ['ig', 'lijk', 'isch', 'baar']
+    ADVERB_SUFFIXES = []
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'de': 'DET', 'het': 'DET', 'een': 'DET',
+            'ik': 'PRON', 'jij': 'PRON', 'hij': 'PRON', 'zij': 'PRON', 'wij': 'PRON', 'jullie': 'PRON',
+            'van': 'PREP', 'in': 'PREP', 'op': 'PREP', 'met': 'PREP', 'voor': 'PREP', 'naar': 'PREP',
+            'en': 'CONJ', 'of': 'CONJ', 'maar': 'CONJ', 'dat': 'CONJ',
+            'niet': 'NEG', 'geen': 'NEG', 'heel': 'ADV', 'zeer': 'ADV', 'goed': 'ADV',
+        }
+        self.irregular_verbs = {
+            'ben': 'zijn', 'is': 'zijn', 'zijn': 'zijn', 'was': 'zijn', 'waren': 'zijn', 'geweest': 'zijn',
+            'heb': 'hebben', 'hebt': 'hebben', 'heeft': 'hebben', 'had': 'hebben', 'gehad': 'hebben',
+            'ga': 'gaan', 'gaat': 'gaan', 'ging': 'gaan', 'gegaan': 'gaan',
+            'kom': 'komen', 'komt': 'komen', 'kwam': 'komen', 'gekomen': 'komen',
+        }
+# =============================================================================
+# Polish (pl)
+# =============================================================================
+class PolishAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "pl"
+    LANG_NAME = "Polish"
+    VERB_INFINITIVE_SUFFIXES = ['ć', 'c']
+    VERB_PARTICIPLE_SUFFIXES = ['ący', 'ąca', 'ące', 'any', 'ana', 'ane', 'ony', 'ona', 'one']
+    NOUN_PLURAL_SUFFIXES = ['y', 'i', 'e', 'a', 'owie']
+    ADJECTIVE_SUFFIXES = ['ny', 'na', 'ne', 'wy', 'wa', 'we', 'ki', 'ka', 'ke']
+    ADVERB_SUFFIXES = ['o', 'ie', 'e']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'ja': 'PRON', 'ty': 'PRON', 'on': 'PRON', 'ona': 'PRON', 'ono': 'PRON', 'my': 'PRON', 'oni': 'PRON',
+            'w': 'PREP', 'na': 'PREP', 'z': 'PREP', 'do': 'PREP', 'od': 'PREP', 'po': 'PREP', 'przy': 'PREP',
+            'i': 'CONJ', 'lub': 'CONJ', 'ale': 'CONJ', 'że': 'CONJ', 'bo': 'CONJ',
+            'nie': 'NEG', 'tak': 'ADV', 'bardzo': 'ADV', 'dobrze': 'ADV',
+        }
+        self.irregular_verbs = {
+            'jestem': 'być', 'jest': 'być', 'są': 'być', 'był': 'być', 'była': 'być',
+            'mam': 'mieć', 'ma': 'mieć', 'mają': 'mieć', 'miał': 'mieć',
+            'idę': 'iść', 'idzie': 'iść', 'szedł': 'iść', 'szła': 'iść',
+        }
+# =============================================================================
+# Turkish (tr)
+# =============================================================================
+class TurkishAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "tr"
+    LANG_NAME = "Turkish"
+    VERB_INFINITIVE_SUFFIXES = ['mak', 'mek']
+    # 현재진행형 어미
+    VERB_PRESENT_SUFFIXES = [
+        'iyorum', 'ıyorum', 'uyorum', 'üyorum',  # 1인칭 단수
+        'iyorsun', 'ıyorsun', 'uyorsun', 'üyorsun',  # 2인칭 단수
+        'iyor', 'ıyor', 'uyor', 'üyor',  # 3인칭 단수
+        'iyoruz', 'ıyoruz', 'uyoruz', 'üyoruz',  # 1인칭 복수
+        'iyorsunuz', 'ıyorsunuz', 'uyorsunuz', 'üyorsunuz',  # 2인칭 복수
+        'iyorlar', 'ıyorlar', 'uyorlar', 'üyorlar',  # 3인칭 복수
+    ]
+    # 과거형 어미
+    VERB_PAST_SUFFIXES = [
+        'dim', 'dım', 'dum', 'düm', 'tim', 'tım', 'tum', 'tüm',  # 1인칭
+        'din', 'dın', 'dun', 'dün', 'tin', 'tın', 'tun', 'tün',  # 2인칭
+        'di', 'dı', 'du', 'dü', 'ti', 'tı', 'tu', 'tü',  # 3인칭
+        'dik', 'dık', 'duk', 'dük', 'tik', 'tık', 'tuk', 'tük',  # 1인칭 복수
+        'diniz', 'dınız', 'dunuz', 'dünüz',  # 2인칭 복수
+        'diler', 'dılar', 'dular', 'düler',  # 3인칭 복수
+    ]
+    # 미래형 어미
+    VERB_FUTURE_SUFFIXES = [
+        'eceğim', 'acağım', 'yacağım', 'yeceğim',  # 1인칭
+        'eceksin', 'acaksın',  # 2인칭
+        'ecek', 'acak',  # 3인칭
+        'eceğiz', 'acağız',  # 1인칭 복수
+    ]
+    VERB_PARTICIPLE_SUFFIXES = ['yor', 'iyor', 'uyor', 'üyor', 'dı', 'di', 'du', 'dü', 'mış', 'miş']
+    NOUN_PLURAL_SUFFIXES = ['lar', 'ler']
+    # 명사 격조사
+    NOUN_CASE_SUFFIXES = [
+        'ı', 'i', 'u', 'ü',  # 목적격
+        'a', 'e', 'ya', 'ye',  # 여격 (방향)
+        'da', 'de', 'ta', 'te',  # 처소격
+        'dan', 'den', 'tan', 'ten',  # 탈격
+        'ın', 'in', 'un', 'ün', 'nın', 'nin', 'nun', 'nün',  # 소유격
+    ]
+    ADJECTIVE_SUFFIXES = ['lı', 'li', 'lu', 'lü', 'sız', 'siz', 'suz', 'süz']
+    ADVERB_SUFFIXES = ['ca', 'ce', 'ça', 'çe']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'bir': 'DET', 'bu': 'DET', 'şu': 'DET',
+            'ben': 'PRON', 'sen': 'PRON', 'o': 'PRON', 'biz': 'PRON', 'siz': 'PRON', 'onlar': 'PRON',
+            'beni': 'PRON', 'seni': 'PRON', 'onu': 'PRON', 'bizi': 'PRON', 'sizi': 'PRON', 'onları': 'PRON',
+            'de': 'PSP', 'da': 'PSP', 'den': 'PSP', 'dan': 'PSP', 'e': 'PSP', 'a': 'PSP',
+            've': 'CONJ', 'veya': 'CONJ', 'ama': 'CONJ', 'fakat': 'CONJ',
+            'değil': 'NEG', 'hayır': 'NEG', 'çok': 'ADV', 'iyi': 'ADV', 'daha': 'ADV',
+            'için': 'POSTP', 'ile': 'POSTP', 'gibi': 'POSTP', 'kadar': 'POSTP',
+        }
+        # 일반 동사
+        self.common_verbs = {
+            'gidiyorum': 'V', 'gidiyorsun': 'V', 'gidiyor': 'V',
+            'geliyorum': 'V', 'geliyorsun': 'V', 'geliyor': 'V',
+            'yapıyorum': 'V', 'yapıyorsun': 'V', 'yapıyor': 'V',
+            'istiyorum': 'V', 'istiyorsun': 'V', 'istiyor': 'V',
+            'biliyorum': 'V', 'biliyorsun': 'V', 'biliyor': 'V',
+            'seviyorum': 'V', 'seviyorsun': 'V', 'seviyor': 'V',
+            'bakıyorum': 'V', 'bakıyorsun': 'V', 'bakıyor': 'V',
+            'okuyorum': 'V', 'okuyorsun': 'V', 'okuyor': 'V',
+            'yazıyorum': 'V', 'yazıyorsun': 'V', 'yazıyor': 'V',
+            'alıyorum': 'V', 'alıyorsun': 'V', 'alıyor': 'V',
+            'veriyorum': 'V', 'veriyorsun': 'V', 'veriyor': 'V',
+            'çalışıyorum': 'V', 'çalışıyorsun': 'V', 'çalışıyor': 'V',
+            'konuşuyorum': 'V', 'konuşuyorsun': 'V', 'konuşuyor': 'V',
+            'koşuyorum': 'V', 'koşuyorsun': 'V', 'koşuyor': 'V',
+            'yiyorum': 'V', 'yiyorsun': 'V', 'yiyor': 'V',
+            'içiyorum': 'V', 'içiyorsun': 'V', 'içiyor': 'V',
+            'oturuyorum': 'V', 'oturuyorsun': 'V', 'oturuyor': 'V',
+            'yürüyorum': 'V', 'yürüyorsun': 'V', 'yürüyor': 'V',
+            'bekliyorum': 'V', 'bekliyorsun': 'V', 'bekliyor': 'V',
+            'anlıyorum': 'V', 'anlıyorsun': 'V', 'anlıyor': 'V',
+            'düşünüyorum': 'V', 'düşünüyorsun': 'V', 'düşünüyor': 'V',
+            'görüyorum': 'V', 'görüyorsun': 'V', 'görüyor': 'V',
+            'duyuyorum': 'V', 'duyuyorsun': 'V', 'duyuyor': 'V',
+            # 과거형
+            'gittim': 'V', 'gittin': 'V', 'gitti': 'V',
+            'geldim': 'V', 'geldin': 'V', 'geldi': 'V',
+            'yaptım': 'V', 'yaptın': 'V', 'yaptı': 'V',
+            'istedim': 'V', 'istedin': 'V', 'istedi': 'V',
+            'oldum': 'V', 'oldun': 'V', 'oldu': 'V',
+            # var/yok
+            'var': 'V', 'yok': 'V',
+        }
+        # 일반 명사 (+ 격조사 붙은 형태)
+        self.common_nouns = {
+            # 목적격(-ı,-i,-u,-ü) 포함
+            'okul': 'N', 'okula': 'N', 'okulda': 'N', 'okuldan': 'N', 'okulun': 'N', 'okulu': 'N',
+            'ev': 'N', 'eve': 'N', 'evde': 'N', 'evden': 'N', 'evin': 'N', 'evi': 'N',
+            'iş': 'N', 'işe': 'N', 'işte': 'N', 'işten': 'N', 'işin': 'N', 'işi': 'N',
+            'yol': 'N', 'yola': 'N', 'yolda': 'N', 'yoldan': 'N', 'yolun': 'N', 'yolu': 'N',
+            'su': 'N', 'suyu': 'N', 'suda': 'N', 'sudan': 'N', 'suyun': 'N',
+            'gün': 'N', 'güne': 'N', 'günde': 'N', 'günden': 'N', 'günün': 'N', 'günü': 'N',
+            'yıl': 'N', 'yıla': 'N', 'yılda': 'N', 'yıldan': 'N', 'yılın': 'N', 'yılı': 'N',
+            'insan': 'N', 'insana': 'N', 'insanda': 'N', 'insanın': 'N', 'insanı': 'N',
+            'şehir': 'N', 'şehre': 'N', 'şehirde': 'N', 'şehrin': 'N', 'şehri': 'N',
+            'ülke': 'N', 'ülkeye': 'N', 'ülkede': 'N', 'ülkenin': 'N', 'ülkeyi': 'N',
+            'kitap': 'N', 'kitaba': 'N', 'kitapta': 'N', 'kitabın': 'N', 'kitabı': 'N',
+            'elma': 'N', 'elmayı': 'N', 'elmada': 'N', 'elmanın': 'N',
+            'araba': 'N', 'arabayı': 'N', 'arabada': 'N', 'arabanın': 'N',
+            'kapı': 'N', 'kapıyı': 'N', 'kapıda': 'N', 'kapının': 'N',
+            'para': 'N', 'parayı': 'N', 'parada': 'N', 'paranın': 'N',
+        }
+    def _analyze_word(self, word: str, offset: int, domain) -> 'Morpheme':
+        """터키어 특화 단어 분석 - common_verbs, common_nouns 체크"""
+        from .advanced_base import Morpheme
+        word_lower = word.lower()
+        # 1. User dictionary
+        if word_lower in self._user_dictionary:
+            lemma, pos_tag, _ = self._user_dictionary[word_lower]
+            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+        # 2. Domain dictionary
+        domain_sense = self._get_domain_sense(word_lower, domain)
+        if domain_sense:
+            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+        # 3. Function words
+        if hasattr(self, 'function_words') and word_lower in self.function_words:
+            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=offset + len(word))
+        # 4. Common verbs (터키어 특화)
+        if hasattr(self, 'common_verbs') and word_lower in self.common_verbs:
+            return Morpheme(surface=word, lemma=word_lower, pos=self.common_verbs[word_lower], start=offset, end=offset + len(word))
+        # 5. Common nouns (터키어 특화)
+        if hasattr(self, 'common_nouns') and word_lower in self.common_nouns:
+            return Morpheme(surface=word, lemma=word_lower, pos=self.common_nouns[word_lower], start=offset, end=offset + len(word))
+        # 6. Irregular verbs
+        if hasattr(self, 'irregular_verbs') and word_lower in self.irregular_verbs:
+            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))
+        # 7. 동사 어미 분석 (현재진행/과거/미래)
+        for suffix in self.VERB_PRESENT_SUFFIXES + self.VERB_PAST_SUFFIXES + self.VERB_FUTURE_SUFFIXES:
+            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
+                return Morpheme(surface=word, lemma=word_lower, pos='V', start=offset, end=offset + len(word))
+        # 8. Extended dictionary (optional external)
+        if hasattr(self, 'extended_verbs') and word_lower in self.extended_verbs:
+            return Morpheme(surface=word, lemma=word_lower, pos='V', start=offset, end=offset + len(word))
+        if hasattr(self, 'extended_adjs') and word_lower in self.extended_adjs:
+            return Morpheme(surface=word, lemma=word_lower, pos='ADJ', start=offset, end=offset + len(word))
+        if hasattr(self, 'extended_advs') and word_lower in self.extended_advs:
+            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))
+        if hasattr(self, 'extended_nouns') and word_lower in self.extended_nouns:
+            return Morpheme(surface=word, lemma=word_lower, pos=self.extended_nouns[word_lower], start=offset, end=offset + len(word))
+        # 9. Morphological analysis (기본)
+        lemma, pos_tag = self._analyze_morphology(word)
+        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+# =============================================================================
+# Vietnamese (vi)
+# =============================================================================
+class VietnameseAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "vi"
+    LANG_NAME = "Vietnamese"
+    # Vietnamese is isolating - no inflection
+    VERB_INFINITIVE_SUFFIXES = []
+    VERB_PARTICIPLE_SUFFIXES = []
+    NOUN_PLURAL_SUFFIXES = []
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'tôi': 'PRON', 'bạn': 'PRON', 'anh': 'PRON', 'chị': 'PRON', 'em': 'PRON', 'nó': 'PRON',
+            'chúng': 'PRON', 'họ': 'PRON',
+            'của': 'PREP', 'trong': 'PREP', 'trên': 'PREP', 'dưới': 'PREP', 'với': 'PREP',
+            'và': 'CONJ', 'hoặc': 'CONJ', 'nhưng': 'CONJ', 'vì': 'CONJ',
+            'không': 'NEG', 'rất': 'ADV', 'lắm': 'ADV', 'quá': 'ADV',
+            'là': 'V', 'có': 'V', 'được': 'V', 'đi': 'V', 'đến': 'V',
+        }
+# =============================================================================
+# Thai (th) - Uses template
+# =============================================================================
+class ThaiAnalyzer(ThaiScriptAnalyzer):
+    pass
+# =============================================================================
+# Indonesian (id)
+# =============================================================================
+class IndonesianAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "id"
+    LANG_NAME = "Indonesian"
+    VERB_INFINITIVE_SUFFIXES = ['kan', 'i']
+    NOUN_PLURAL_SUFFIXES = []  # Reduplication, not suffix
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'yang': 'REL', 'ini': 'DET', 'itu': 'DET',
+            'saya': 'PRON', 'aku': 'PRON', 'kamu': 'PRON', 'dia': 'PRON', 'kami': 'PRON', 'mereka': 'PRON',
+            'di': 'PREP', 'ke': 'PREP', 'dari': 'PREP', 'dengan': 'PREP', 'untuk': 'PREP',
+            'dan': 'CONJ', 'atau': 'CONJ', 'tetapi': 'CONJ', 'karena': 'CONJ',
+            'tidak': 'NEG', 'bukan': 'NEG', 'sangat': 'ADV', 'sudah': 'ADV', 'akan': 'ADV',
+            'adalah': 'V', 'ada': 'V', 'menjadi': 'V',
+        }
+# =============================================================================
+# Hebrew (he) - Uses template
+# =============================================================================
+class HebrewAnalyzer(HebrewScriptAnalyzer):
+    pass
+# =============================================================================
+# Persian/Farsi (fa)
+# =============================================================================
+class PersianAnalyzer(ArabicScriptAnalyzer):
+    LANG_CODE = "fa"
+    LANG_NAME = "Persian"
+    def _build_base_dictionary(self):
+        self.prefixes = {'می': 'PRES', 'ن': 'NEG'}
+        self.suffixes = {
+            'ها': 'PL', 'ان': 'PL',
+            'م': 'PRON', 'ت': 'PRON', 'ش': 'PRON', 'مان': 'PRON', 'تان': 'PRON', 'شان': 'PRON',
+        }
+        self.function_words = {
+            'من': 'PRON', 'تو': 'PRON', 'او': 'PRON', 'ما': 'PRON', 'شما': 'PRON', 'آنها': 'PRON',
+            'در': 'PREP', 'به': 'PREP', 'از': 'PREP', 'با': 'PREP', 'برای': 'PREP',
+            'و': 'CONJ', 'یا': 'CONJ', 'اما': 'CONJ', 'که': 'CONJ',
+            'نه': 'NEG', 'خیلی': 'ADV', 'خوب': 'ADV',
+            'است': 'V', 'هست': 'V', 'بود': 'V', 'شد': 'V',
+        }
+# =============================================================================
+# Ukrainian (uk)
+# =============================================================================
+class UkrainianAnalyzer(CyrillicScriptAnalyzer):
+    LANG_CODE = "uk"
+    LANG_NAME = "Ukrainian"
+    VERB_INFINITIVE_SUFFIX = 'ти'
+    REFLEXIVE_SUFFIX = 'ся'
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'я': 'PRON', 'ти': 'PRON', 'він': 'PRON', 'вона': 'PRON', 'воно': 'PRON',
+            'ми': 'PRON', 'ви': 'PRON', 'вони': 'PRON',
+            'в': 'PREP', 'на': 'PREP', 'з': 'PREP', 'до': 'PREP', 'від': 'PREP', 'за': 'PREP',
+            'і': 'CONJ', 'та': 'CONJ', 'але': 'CONJ', 'що': 'CONJ', 'як': 'CONJ',
+            'не': 'NEG', 'так': 'ADV', 'дуже': 'ADV', 'добре': 'ADV',
+        }
+        self.irregular_verbs = {
+            'є': 'бути', 'був': 'бути', 'була': 'бути', 'було': 'бути', 'були': 'бути',
+            'маю': 'мати', 'має': 'мати', 'мав': 'мати',
+            'іду': 'йти', 'йде': 'йти', 'йшов': 'йти', 'йшла': 'йти',
+        }
+# =============================================================================
+# Greek (el) - Uses template
+# =============================================================================
+class GreekAnalyzer(GreekScriptAnalyzer):
+    pass
+# =============================================================================
+# Czech (cs)
+# =============================================================================
+class CzechAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "cs"
+    LANG_NAME = "Czech"
+    VERB_INFINITIVE_SUFFIXES = ['t', 'ti', 'ci']
+    NOUN_PLURAL_SUFFIXES = ['y', 'i', 'e', 'a', 'ové']
+    ADJECTIVE_SUFFIXES = ['ý', 'á', 'é', 'í']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'já': 'PRON', 'ty': 'PRON', 'on': 'PRON', 'ona': 'PRON', 'ono': 'PRON',
+            'my': 'PRON', 'vy': 'PRON', 'oni': 'PRON',
+            'v': 'PREP', 'na': 'PREP', 'z': 'PREP', 'do': 'PREP', 's': 'PREP', 'k': 'PREP',
+            'a': 'CONJ', 'nebo': 'CONJ', 'ale': 'CONJ', 'že': 'CONJ',
+            'ne': 'NEG', 'ano': 'ADV', 'velmi': 'ADV', 'dobře': 'ADV',
+        }
+        self.irregular_verbs = {
+            'jsem': 'být', 'jsi': 'být', 'je': 'být', 'jsme': 'být', 'jste': 'být', 'jsou': 'být',
+            'byl': 'být', 'byla': 'být', 'bylo': 'být', 'byli': 'být',
+            'mám': 'mít', 'má': 'mít', 'máme': 'mít', 'měl': 'mít',
+        }
+# =============================================================================
+# Romanian (ro)
+# =============================================================================
+class RomanianAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "ro"
+    LANG_NAME = "Romanian"
+    VERB_INFINITIVE_SUFFIXES = ['a', 'ea', 'e', 'i', 'î']
+    NOUN_PLURAL_SUFFIXES = ['i', 'e', 'uri', 'le']
+    ADJECTIVE_SUFFIXES = ['os', 'oasă', 'ic', 'ică']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'eu': 'PRON', 'tu': 'PRON', 'el': 'PRON', 'ea': 'PRON', 'noi': 'PRON', 'ei': 'PRON', 'ele': 'PRON',
+            'un': 'DET', 'o': 'DET', 'niște': 'DET',
+            'în': 'PREP', 'pe': 'PREP', 'la': 'PREP', 'de': 'PREP', 'cu': 'PREP', 'din': 'PREP',
+            'și': 'CONJ', 'sau': 'CONJ', 'dar': 'CONJ', 'că': 'CONJ',
+            'nu': 'NEG', 'da': 'ADV', 'foarte': 'ADV', 'bine': 'ADV',
+        }
+        self.irregular_verbs = {
+            'sunt': 'fi', 'ești': 'fi', 'este': 'fi', 'suntem': 'fi', 'sunteți': 'fi',
+            'am': 'avea', 'ai': 'avea', 'are': 'avea', 'avem': 'avea',
+        }
+# =============================================================================
+# Swedish (sv)
+# =============================================================================
+class SwedishAnalyzer(LatinScriptAnalyzer):
+    LANG_CODE = "sv"
+    LANG_NAME = "Swedish"
+    VERB_INFINITIVE_SUFFIXES = ['a']
+    VERB_PARTICIPLE_SUFFIXES = ['r', 'de', 't', 'ande', 'ende']
+    NOUN_PLURAL_SUFFIXES = ['or', 'ar', 'er', 'n', 'en']
+    ADJECTIVE_SUFFIXES = ['ig', 'lig', 'isk', 'sk']
+    def _build_base_dictionary(self):
+        self.function_words = {
+            'en': 'DET', 'ett': 'DET', 'den': 'DET', 'det': 'DET', 'de': 'DET',
+            'jag': 'PRON', 'du': 'PRON', 'han': 'PRON', 'hon': 'PRON', 'vi': 'PRON', 'de': 'PRON',
+            'i': 'PREP', 'på': 'PREP', 'till': 'PREP', 'från': 'PREP', 'med': 'PREP', 'av': 'PREP',
+            'och': 'CONJ', 'eller': 'CONJ', 'men': 'CONJ', 'att': 'CONJ',
+            'inte': 'NEG', 'mycket': 'ADV', 'bra': 'ADV', 'nu': 'ADV',
+        }
+        self.irregular_verbs = {
+            'är': 'vara', 'var': 'vara', 'varit': 'vara',
+            'har': 'ha', 'hade': 'ha', 'haft': 'ha',
+            'går': 'gå', 'gick': 'gå', 'gått': 'gå',
+            'kommer': 'komma', 'kom': 'komma', 'kommit': 'komma',
+        }
+# Export all
+__all__ = [
+    'PortugueseAnalyzer', 'ItalianAnalyzer', 'DutchAnalyzer', 'PolishAnalyzer',
+    'TurkishAnalyzer', 'VietnameseAnalyzer', 'ThaiAnalyzer', 'IndonesianAnalyzer',
+    'HebrewAnalyzer', 'PersianAnalyzer', 'UkrainianAnalyzer', 'GreekAnalyzer',
+    'CzechAnalyzer', 'RomanianAnalyzer', 'SwedishAnalyzer',
+]