pythainlp 5.0.0.dev0__tar.gz → 5.0.0.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/CONTRIBUTING.md +5 -2
- {pythainlp-5.0.0.dev0/pythainlp.egg-info → pythainlp-5.0.0.dev2}/PKG-INFO +1 -1
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/README.md +46 -10
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/README_TH.md +39 -7
- pythainlp-5.0.0.dev2/pyproject.toml +16 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/__init__.py +3 -19
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/__main__.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/ancient/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ancient/aksonhan.py +12 -22
- pythainlp-5.0.0.dev2/pythainlp/augment/__init__.py +10 -0
- pythainlp-5.0.0.dev2/pythainlp/augment/lm/__init__.py +16 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/lm/fasttext.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/augment/lm/phayathaibert.py +105 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/lm/wangchanberta.py +7 -16
- pythainlp-5.0.0.dev2/pythainlp/augment/word2vec/__init__.py +12 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/bpemb_wv.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/ltw2v.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/thai2fit.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/wordnet.py +3 -13
- pythainlp-5.0.0.dev2/pythainlp/benchmarks/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/benchmarks/word_tokenization.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/chat/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/chat/core.py +28 -28
- pythainlp-5.0.0.dev2/pythainlp/classify/__init__.py +10 -0
- {pythainlp-5.0.0.dev0/pythainlp/cls → pythainlp-5.0.0.dev2/pythainlp/classify}/param_free.py +8 -15
- pythainlp-5.0.0.dev2/pythainlp/cli/__init__.py +25 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/benchmark.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/data.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/soundex.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/tag.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/tokenize.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/cls/__init__.py +16 -0
- pythainlp-5.0.0.dev2/pythainlp/coref/__init__.py +8 -0
- pythainlp-5.0.0.dev2/pythainlp/coref/_fastcoref.py +34 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/coref/core.py +22 -20
- pythainlp-5.0.0.dev2/pythainlp/coref/han_coref.py +12 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/__init__.py +24 -23
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/common.py +87 -48
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/conceptnet.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/core.py +95 -51
- pythainlp-5.0.0.dev2/pythainlp/corpus/icu.py +26 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/oscar.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/th_en_translit.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/tnc.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/ttc.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/util.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/corpus/volubilis.py +32 -0
- pythainlp-5.0.0.dev2/pythainlp/corpus/wikipedia.py +35 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/wordnet.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/el/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/el/_multiel.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/el/core.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/generate/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/thai2fit.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/wangchanglm.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/khavee/__init__.py +7 -0
- pythainlp-5.0.0.dev2/pythainlp/khavee/core.py +652 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/khavee/example.py +58 -18
- pythainlp-5.0.0.dev2/pythainlp/morpheme/__init__.py +13 -0
- {pythainlp-5.0.0.dev0/pythainlp/util → pythainlp-5.0.0.dev2/pythainlp/morpheme}/thaiwordcheck.py +4 -14
- pythainlp-5.0.0.dev2/pythainlp/morpheme/word_formation.py +56 -0
- pythainlp-5.0.0.dev2/pythainlp/parse/__init__.py +10 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/core.py +10 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/spacy_thai_engine.py +2 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/transformers_ud.py +13 -9
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/ud_goeswith.py +32 -21
- pythainlp-5.0.0.dev2/pythainlp/phayathaibert/__init__.py +23 -0
- pythainlp-5.0.0.dev2/pythainlp/phayathaibert/core.py +449 -0
- pythainlp-5.0.0.dev2/pythainlp/soundex/__init__.py +25 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/lk82.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/metasound.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/prayut_and_somchaip.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/sound.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/udom83.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/spell/__init__.py +21 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/core.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/spell/phunspell.py +24 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/symspellpy.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/spell/tltk.py +21 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/wanchanberta_thai_grammarly.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/summarize/__init__.py +14 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/freq.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/keybert.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/mt5.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/tag/__init__.py +26 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/_tag_perceptron.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/blackboard.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/chunk.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/crfchunk.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/locations.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/named_entity.py +11 -16
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/orchid.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/perceptron.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/pos_tag.py +25 -27
- pythainlp-5.0.0.dev2/pythainlp/tag/thai_nner.py +14 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/thainer.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/tltk.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/unigram.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/wangchanberta_onnx.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/__init__.py +6 -17
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/_utils.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/attacut.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/core.py +67 -70
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/crfcls.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/crfcut.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/deepcut.py +3 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/etcc.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/tokenize/han_solo.py +130 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/longest.py +3 -1
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/multi_cut.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/nercut.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/newmm.py +3 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/nlpo3.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/oskut.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/pyicu.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/sefr_cut.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/tokenize/ssg.py +16 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tcc.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tcc_p.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/thaisumcut.py +3 -16
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tltk.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/tokenize/wtsplit.py +78 -0
- pythainlp-5.0.0.dev2/pythainlp/tools/__init__.py +16 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tools/misspell.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tools/path.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/translate/__init__.py +15 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/en_th.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/th_fr.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/zh_th.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/transliterate/__init__.py +11 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/core.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/transliterate/ipa.py +28 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/iso_11940.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/lookup.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/pyicu.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/royin.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/spoonerism.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thai2rom.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thai2rom_onnx.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thaig2p.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/tltk.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/w2p.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/wunsen.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/__init__.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/core.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/preprocess.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/tokenizer.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/__init__.py +4 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/abbreviation.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/collate.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/date.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/digitconv.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/emojiconv.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/util/encoding.py +39 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/keyboard.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/keywords.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/util/morse.py +199 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/normalize.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/numtoword.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/util/phoneme.py +257 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/pronounce.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/remove_trailing_repeat_consonants.py +13 -22
- pythainlp-5.0.0.dev2/pythainlp/util/spell_words.py +129 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/strftime.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/syllable.py +2 -13
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/thai.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/util/thaiwordcheck.py +15 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/time.py +3 -18
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/trie.py +7 -18
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/wordtonum.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/wangchanberta/__init__.py +14 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/wangchanberta/core.py +39 -34
- pythainlp-5.0.0.dev2/pythainlp/word_vector/__init__.py +15 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/word_vector/core.py +2 -13
- pythainlp-5.0.0.dev2/pythainlp/wsd/__init__.py +8 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/wsd/core.py +60 -47
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2/pythainlp.egg-info}/PKG-INFO +1 -1
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/SOURCES.txt +15 -2
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/setup.cfg +1 -1
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/setup.py +3 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/__init__.py +2 -0
- pythainlp-5.0.0.dev2/tests/test_ancient.py +22 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_augment.py +7 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_benchmarks.py +6 -2
- pythainlp-5.0.0.dev2/tests/test_classify.py +23 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_cli.py +4 -4
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_coref.py +2 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_corpus.py +54 -14
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_el.py +3 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_generate.py +2 -0
- pythainlp-5.0.0.dev2/tests/test_khavee.py +49 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_misspell.py +4 -6
- pythainlp-5.0.0.dev2/tests/test_morpheme.py +37 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_parse.py +5 -1
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_soundex.py +11 -3
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_spell.py +4 -4
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_summarize.py +11 -3
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tag.py +49 -32
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tokenize.py +86 -89
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tools.py +2 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_ulmfit.py +13 -21
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_util.py +114 -157
- pythainlp-5.0.0.dev2/tests/test_wsd.py +13 -0
- pythainlp-5.0.0.dev0/pythainlp/ancient/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/augment/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/augment/lm/__init__.py +0 -25
- pythainlp-5.0.0.dev0/pythainlp/augment/word2vec/__init__.py +0 -23
- pythainlp-5.0.0.dev0/pythainlp/benchmarks/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/chat/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/cli/__init__.py +0 -36
- pythainlp-5.0.0.dev0/pythainlp/cls/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/coref/__init__.py +0 -19
- pythainlp-5.0.0.dev0/pythainlp/coref/_fastcoref.py +0 -38
- pythainlp-5.0.0.dev0/pythainlp/coref/han_coref.py +0 -25
- pythainlp-5.0.0.dev0/pythainlp/el/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/generate/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/khavee/__init__.py +0 -17
- pythainlp-5.0.0.dev0/pythainlp/khavee/core.py +0 -558
- pythainlp-5.0.0.dev0/pythainlp/parse/__init__.py +0 -19
- pythainlp-5.0.0.dev0/pythainlp/soundex/__init__.py +0 -36
- pythainlp-5.0.0.dev0/pythainlp/spell/__init__.py +0 -32
- pythainlp-5.0.0.dev0/pythainlp/spell/phunspell.py +0 -35
- pythainlp-5.0.0.dev0/pythainlp/spell/tltk.py +0 -32
- pythainlp-5.0.0.dev0/pythainlp/summarize/__init__.py +0 -25
- pythainlp-5.0.0.dev0/pythainlp/tag/__init__.py +0 -37
- pythainlp-5.0.0.dev0/pythainlp/tag/thai_nner.py +0 -25
- pythainlp-5.0.0.dev0/pythainlp/tokenize/han_solo.py +0 -144
- pythainlp-5.0.0.dev0/pythainlp/tokenize/ssg.py +0 -27
- pythainlp-5.0.0.dev0/pythainlp/tokenize/wtsplit.py +0 -85
- pythainlp-5.0.0.dev0/pythainlp/tools/__init__.py +0 -27
- pythainlp-5.0.0.dev0/pythainlp/translate/__init__.py +0 -26
- pythainlp-5.0.0.dev0/pythainlp/transliterate/__init__.py +0 -22
- pythainlp-5.0.0.dev0/pythainlp/transliterate/ipa.py +0 -39
- pythainlp-5.0.0.dev0/pythainlp/util/encoding.py +0 -31
- pythainlp-5.0.0.dev0/pythainlp/util/phoneme.py +0 -253
- pythainlp-5.0.0.dev0/pythainlp/util/spell_words.py +0 -121
- pythainlp-5.0.0.dev0/pythainlp/wangchanberta/__init__.py +0 -21
- pythainlp-5.0.0.dev0/pythainlp/word_vector/__init__.py +0 -26
- pythainlp-5.0.0.dev0/pythainlp/wsd/__init__.py +0 -19
- pythainlp-5.0.0.dev0/tests/test_ancient.py +0 -20
- pythainlp-5.0.0.dev0/tests/test_cls.py +0 -20
- pythainlp-5.0.0.dev0/tests/test_khavee.py +0 -43
- pythainlp-5.0.0.dev0/tests/test_wsd.py +0 -10
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/LICENSE +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/MANIFEST.in +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/esupar_engine.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/pn.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/small100.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/tokenization_small100.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/dependency_links.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/entry_points.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/not-zip-safe +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/requires.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/top_level.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/eval-details-input.json +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/eval-input.yml +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/input.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/sentences.yml +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/test.txt +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_translate.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_transliterate.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_wangchanberta.py +0 -0
- {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_word_vector.py +0 -0
|
@@ -135,10 +135,10 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
|
|
|
135
135
|
- Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com> - foundation, distribution and maintenance
|
|
136
136
|
- Korakot Chaovavanich - initial tokenization and soundex codes
|
|
137
137
|
- Charin Polpanumas - classification and benchmarking
|
|
138
|
-
- Peeradej Tanruangporn - documentation
|
|
139
138
|
- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance
|
|
140
|
-
-
|
|
139
|
+
- Lalita Lowphansirikul - documentation
|
|
141
140
|
- Pattarawat Chormai - benchmarking
|
|
141
|
+
- Peerat Limkonchotiwat
|
|
142
142
|
- Thanathip Suntorntip - nlpO3 maintenance, Rust Developer
|
|
143
143
|
- Can Udomcharoenchaikit - documentation and codes
|
|
144
144
|
|
|
@@ -147,6 +147,9 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
|
|
|
147
147
|
- Wannaphong Phatthiyaphaibun
|
|
148
148
|
|
|
149
149
|
|
|
150
|
+
### Past
|
|
151
|
+
- Peeradej Tanruangporn - documentation
|
|
152
|
+
|
|
150
153
|
## References
|
|
151
154
|
|
|
152
155
|
- **[Maximum Matching]** -- Manabu Sassano. Deterministic Word Segmentation Using Maximum Matching with Fully Lexicalized Rules. Retrieved from http://www.aclweb.org/anthology/E14-4016
|
|
@@ -29,7 +29,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
|
|
|
29
29
|
|
|
30
30
|
## Getting Started
|
|
31
31
|
|
|
32
|
-
- PyThaiNLP 2 requires Python 3.
|
|
32
|
+
- PyThaiNLP 2 requires Python 3.7+. Python 2.7 users can use PyThaiNLP 1.6. See [2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) | [Upgrading from 1.7](https://pythainlp.github.io/docs/2.0/notes/pythainlp-1_7-2_0.html) | [Upgrading ThaiNER from 1.7](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0)
|
|
33
33
|
- [PyThaiNLP Get Started notebook](https://pythainlp.github.io/tutorials/notebooks/pythainlp_get_started.html) | [API document](https://pythainlp.github.io/docs) | [Tutorials](https://pythainlp.github.io/tutorials)
|
|
34
34
|
- [Official website](https://pythainlp.github.io/) | [PyPI](https://pypi.org/project/pythainlp/) | [Facebook page](https://www.facebook.com/pythainlp/)
|
|
35
35
|
- [Who uses PyThaiNLP?](https://github.com/PyThaiNLP/pythainlp/blob/dev/INTHEWILD.md)
|
|
@@ -142,24 +142,60 @@ You can read [INTHEWILD.md](https://github.com/PyThaiNLP/pythainlp/blob/dev/INTH
|
|
|
142
142
|
|
|
143
143
|
If you use `PyThaiNLP` in your project or publication, please cite the library as follows:
|
|
144
144
|
|
|
145
|
-
```
|
|
146
145
|
Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. http://doi.org/10.5281/zenodo.3519354
|
|
147
|
-
```
|
|
148
146
|
|
|
149
147
|
or by BibTeX entry:
|
|
150
148
|
|
|
151
149
|
``` bib
|
|
152
150
|
@misc{pythainlp,
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
151
|
+
title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
|
|
152
|
+
author = "Phatthiyaphaibun, Wannaphong and
|
|
153
|
+
Chaovavanich, Korakot and
|
|
154
|
+
Polpanumas, Charin and
|
|
155
|
+
Suriyawongkul, Arthit and
|
|
156
|
+
Lowphansirikul, Lalita and
|
|
157
|
+
Chormai, Pattarawat",
|
|
158
|
+
month = jun,
|
|
159
|
+
year = "2016",
|
|
160
|
+
doi = {10.5281/zenodo.3519354},
|
|
161
|
+
publisher = {Zenodo},
|
|
162
|
+
url = {http://doi.org/10.5281/zenodo.3519354}
|
|
160
163
|
}
|
|
161
164
|
```
|
|
162
165
|
|
|
166
|
+
Our [NLP-OSS 2023](https://nlposs.github.io/2023/) paper:
|
|
167
|
+
|
|
168
|
+
Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. [PyThaiNLP: Thai Natural Language Processing in Python.](https://aclanthology.org/2023.nlposs-1.4) In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.
|
|
169
|
+
|
|
170
|
+
and its BibTeX entry:
|
|
171
|
+
|
|
172
|
+
```bib
|
|
173
|
+
@inproceedings{phatthiyaphaibun-etal-2023-pythainlp,
|
|
174
|
+
title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
|
|
175
|
+
author = "Phatthiyaphaibun, Wannaphong and
|
|
176
|
+
Chaovavanich, Korakot and
|
|
177
|
+
Polpanumas, Charin and
|
|
178
|
+
Suriyawongkul, Arthit and
|
|
179
|
+
Lowphansirikul, Lalita and
|
|
180
|
+
Chormai, Pattarawat and
|
|
181
|
+
Limkonchotiwat, Peerat and
|
|
182
|
+
Suntorntip, Thanathip and
|
|
183
|
+
Udomcharoenchaikit, Can",
|
|
184
|
+
editor = "Tan, Liling and
|
|
185
|
+
Milajevs, Dmitrijs and
|
|
186
|
+
Chauhan, Geeticka and
|
|
187
|
+
Gwinnup, Jeremy and
|
|
188
|
+
Rippeth, Elijah",
|
|
189
|
+
booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
|
|
190
|
+
month = dec,
|
|
191
|
+
year = "2023",
|
|
192
|
+
address = "Singapore, Singapore",
|
|
193
|
+
publisher = "Empirical Methods in Natural Language Processing",
|
|
194
|
+
url = "https://aclanthology.org/2023.nlposs-1.4",
|
|
195
|
+
pages = "25--36",
|
|
196
|
+
abstract = "We present PyThaiNLP, a free and open-source natural language processing (NLP) library for Thai language implemented in Python. It provides a wide range of software, models, and datasets for Thai language. We first provide a brief historical context of tools for Thai language prior to the development of PyThaiNLP. We then outline the functionalities it provided as well as datasets and pre-trained language models. We later summarize its development milestones and discuss our experience during its development. We conclude by demonstrating how industrial and research communities utilize PyThaiNLP in their work. The library is freely available at https://github.com/pythainlp/pythainlp.",
|
|
197
|
+
}
|
|
198
|
+
```
|
|
163
199
|
|
|
164
200
|
## Sponsors
|
|
165
201
|
|
|
@@ -123,13 +123,11 @@ thainlp help
|
|
|
123
123
|
|
|
124
124
|
## การอ้างอิง
|
|
125
125
|
|
|
126
|
-
|
|
126
|
+
หากคุณใช้ซอฟต์แวร์ `PyThaiNLP` ในโครงงานหรืองานวิจัยของคุณ คุณสามารถอ้างอิงได้ตามนี้
|
|
127
127
|
|
|
128
|
-
```
|
|
129
128
|
Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. http://doi.org/10.5281/zenodo.3519354
|
|
130
|
-
```
|
|
131
129
|
|
|
132
|
-
|
|
130
|
+
โดยสามารถใช้ BibTeX นี้:
|
|
133
131
|
|
|
134
132
|
``` bib
|
|
135
133
|
@misc{pythainlp,
|
|
@@ -143,6 +141,40 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur
|
|
|
143
141
|
}
|
|
144
142
|
```
|
|
145
143
|
|
|
144
|
+
บทความของเราในงานประชุมวิชาการ [NLP-OSS 2023](https://nlposs.github.io/2023/):
|
|
145
|
+
|
|
146
|
+
Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. [PyThaiNLP: Thai Natural Language Processing in Python.](https://aclanthology.org/2023.nlposs-1.4) In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.
|
|
147
|
+
|
|
148
|
+
โดยสามารถใช้ BibTeX นี้:
|
|
149
|
+
|
|
150
|
+
```bib
|
|
151
|
+
@inproceedings{phatthiyaphaibun-etal-2023-pythainlp,
|
|
152
|
+
title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
|
|
153
|
+
author = "Phatthiyaphaibun, Wannaphong and
|
|
154
|
+
Chaovavanich, Korakot and
|
|
155
|
+
Polpanumas, Charin and
|
|
156
|
+
Suriyawongkul, Arthit and
|
|
157
|
+
Lowphansirikul, Lalita and
|
|
158
|
+
Chormai, Pattarawat and
|
|
159
|
+
Limkonchotiwat, Peerat and
|
|
160
|
+
Suntorntip, Thanathip and
|
|
161
|
+
Udomcharoenchaikit, Can",
|
|
162
|
+
editor = "Tan, Liling and
|
|
163
|
+
Milajevs, Dmitrijs and
|
|
164
|
+
Chauhan, Geeticka and
|
|
165
|
+
Gwinnup, Jeremy and
|
|
166
|
+
Rippeth, Elijah",
|
|
167
|
+
booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
|
|
168
|
+
month = dec,
|
|
169
|
+
year = "2023",
|
|
170
|
+
address = "Singapore, Singapore",
|
|
171
|
+
publisher = "Empirical Methods in Natural Language Processing",
|
|
172
|
+
url = "https://aclanthology.org/2023.nlposs-1.4",
|
|
173
|
+
pages = "25--36",
|
|
174
|
+
abstract = "We present PyThaiNLP, a free and open-source natural language processing (NLP) library for Thai language implemented in Python. It provides a wide range of software, models, and datasets for Thai language. We first provide a brief historical context of tools for Thai language prior to the development of PyThaiNLP. We then outline the functionalities it provided as well as datasets and pre-trained language models. We later summarize its development milestones and discuss our experience during its development. We conclude by demonstrating how industrial and research communities utilize PyThaiNLP in their work. The library is freely available at https://github.com/pythainlp/pythainlp.",
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
146
178
|
## ร่วมสนับสนุน PyThaiNLP
|
|
147
179
|
|
|
148
180
|
- กรุณา fork แล้วพัฒนาต่อ จากนั้นสร้าง pull request กลับมา :)
|
|
@@ -157,10 +189,10 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur
|
|
|
157
189
|
|
|
158
190
|
| | สัญญาอนุญาต |
|
|
159
191
|
|:---|:----|
|
|
160
|
-
| PyThaiNLP
|
|
161
|
-
|
|
|
192
|
+
| ต้นรหัสซอร์สโค้ดและโน้ตบุ๊กของ PyThaiNLP | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) |
|
|
193
|
+
| ฐานข้อมูลภาษา ชุดข้อมูล และเอกสารที่สร้างโดยโครงการ PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)|
|
|
162
194
|
| Language models created by PyThaiNLP | [Creative Commons Attribution 4.0 International Public License (CC-by)](https://creativecommons.org/licenses/by/4.0/) |
|
|
163
|
-
|
|
|
195
|
+
| สำหรับฐานข้อมูลภาษาและโมเดลอื่นที่อาจมาพร้อมกับซอฟต์แวร์ PyThaiNLP | ดู [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) |
|
|
164
196
|
|
|
165
197
|
|
|
166
198
|
## บัตรโมเดล
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[tool.ruff]
|
|
2
|
+
line-length = 79
|
|
3
|
+
indent-width = 4
|
|
4
|
+
target-version = "py38"
|
|
5
|
+
|
|
6
|
+
[tool.ruff.format]
|
|
7
|
+
quote-style = "double"
|
|
8
|
+
indent-style = "space"
|
|
9
|
+
skip-magic-trailing-comma = false
|
|
10
|
+
line-ending = "auto"
|
|
11
|
+
docstring-code-format = true
|
|
12
|
+
|
|
13
|
+
[tool.ruff.lint.mccabe]
|
|
14
|
+
# Flag errors (`C901`) whenever the complexity level exceeds 40. Default is 10.
|
|
15
|
+
# We should aim to gradually reduce this to 10.
|
|
16
|
+
max-complexity = 40
|
|
@@ -1,23 +1,7 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
#
|
|
3
|
-
#
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
#
|
|
10
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
-
#
|
|
12
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
-
# See the License for the specific language governing permissions and
|
|
16
|
-
# limitations under the License.
|
|
17
|
-
#
|
|
18
|
-
# URL: <https://pythainlp.github.io/>
|
|
19
|
-
# For license information, see LICENSE
|
|
20
|
-
__version__ = "5.0.0dev0"
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
__version__ = "5.0.0dev2"
|
|
21
5
|
|
|
22
6
|
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
|
|
23
7
|
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
import argparse
|
|
16
5
|
import sys
|
|
17
6
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
"""
|
|
5
|
+
Ancient versions of the Thai language
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__all__ = ["aksonhan_to_current"]
|
|
9
|
+
|
|
10
|
+
from pythainlp.ancient.aksonhan import aksonhan_to_current
|
|
@@ -1,34 +1,23 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
from pythainlp.util import Trie
|
|
16
|
-
from pythainlp import thai_consonants,thai_tonemarks
|
|
5
|
+
from pythainlp import thai_consonants, thai_tonemarks
|
|
17
6
|
from pythainlp.tokenize import Tokenizer
|
|
18
7
|
from pythainlp.corpus import thai_orst_words
|
|
19
8
|
|
|
20
9
|
|
|
21
10
|
_dict_aksonhan = {}
|
|
22
11
|
for i in list(thai_consonants):
|
|
23
|
-
if i=="ร":
|
|
12
|
+
if i == "ร":
|
|
24
13
|
continue
|
|
25
14
|
for j in list(thai_tonemarks):
|
|
26
|
-
_dict_aksonhan[i+j+i] = "ั"+j+i
|
|
27
|
-
_dict_aksonhan[i+i+j+i] = i+"ั"+j+i
|
|
28
|
-
_dict_aksonhan[i+i] = "ั"+i
|
|
15
|
+
_dict_aksonhan[i + j + i] = "ั" + j + i
|
|
16
|
+
_dict_aksonhan[i + i + j + i] = i + "ั" + j + i
|
|
17
|
+
_dict_aksonhan[i + i] = "ั" + i
|
|
29
18
|
_set_aksonhan = set(_dict_aksonhan.keys())
|
|
30
|
-
_trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants))
|
|
31
|
-
_tokenizer = Tokenizer(custom_dict=_trie,engine="mm")
|
|
19
|
+
_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
|
|
20
|
+
_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
|
|
32
21
|
_dict_thai = set(thai_orst_words()) # call Thai words
|
|
33
22
|
|
|
34
23
|
|
|
@@ -63,8 +52,9 @@ def aksonhan_to_current(word: str) -> str:
|
|
|
63
52
|
return word
|
|
64
53
|
elif word in _set_aksonhan:
|
|
65
54
|
return _dict_aksonhan[word]
|
|
66
|
-
elif word in _dict_thai:
|
|
55
|
+
elif word in _dict_thai: # word in Thai words
|
|
67
56
|
return word
|
|
57
|
+
|
|
68
58
|
_seg = _tokenizer.word_tokenize(word)
|
|
69
59
|
_w = []
|
|
70
60
|
for i in _seg:
|
|
@@ -72,4 +62,4 @@ def aksonhan_to_current(word: str) -> str:
|
|
|
72
62
|
_w.append(_dict_aksonhan[i])
|
|
73
63
|
else:
|
|
74
64
|
_w.append(i)
|
|
75
|
-
return
|
|
65
|
+
return "".join(_w)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
"""
|
|
5
|
+
Language Models
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"FastTextAug",
|
|
10
|
+
"Thai2transformersAug",
|
|
11
|
+
"ThaiTextAugmenter",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
from pythainlp.augment.lm.fasttext import FastTextAug
|
|
15
|
+
from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
|
|
16
|
+
from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
import itertools
|
|
16
5
|
from typing import List, Tuple
|
|
17
6
|
from gensim.models.fasttext import FastText as FastText_gensim
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
import random
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from pythainlp.phayathaibert.core import ThaiTextProcessor
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_MODEL_NAME = "clicknext/phayathaibert"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ThaiTextAugmenter:
|
|
16
|
+
def __init__(self) -> None:
|
|
17
|
+
from transformers import (
|
|
18
|
+
AutoTokenizer,
|
|
19
|
+
AutoModelForMaskedLM,
|
|
20
|
+
pipeline,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
|
|
24
|
+
self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
|
|
25
|
+
_MODEL_NAME
|
|
26
|
+
)
|
|
27
|
+
self.model = pipeline(
|
|
28
|
+
"fill-mask",
|
|
29
|
+
tokenizer=self.tokenizer,
|
|
30
|
+
model=self.model_for_masked_lm,
|
|
31
|
+
)
|
|
32
|
+
self.processor = ThaiTextProcessor()
|
|
33
|
+
|
|
34
|
+
def generate(
|
|
35
|
+
self,
|
|
36
|
+
sample_text: str,
|
|
37
|
+
word_rank: int,
|
|
38
|
+
max_length: int = 3,
|
|
39
|
+
sample: bool = False,
|
|
40
|
+
) -> str:
|
|
41
|
+
sample_txt = sample_text
|
|
42
|
+
final_text = ""
|
|
43
|
+
|
|
44
|
+
for j in range(max_length):
|
|
45
|
+
input = self.processor.preprocess(sample_txt)
|
|
46
|
+
if sample:
|
|
47
|
+
random_word_idx = random.randint(0, 4)
|
|
48
|
+
output = self.model(input)[random_word_idx]["sequence"]
|
|
49
|
+
else:
|
|
50
|
+
output = self.model(input)[word_rank]["sequence"]
|
|
51
|
+
sample_txt = output + "<mask>"
|
|
52
|
+
final_text = sample_txt
|
|
53
|
+
|
|
54
|
+
gen_txt = re.sub("<mask>", "", final_text)
|
|
55
|
+
|
|
56
|
+
return gen_txt
|
|
57
|
+
|
|
58
|
+
def augment(
|
|
59
|
+
self, text: str, num_augs: int = 3, sample: bool = False
|
|
60
|
+
) -> List[str]:
|
|
61
|
+
"""
|
|
62
|
+
Text augmentation from PhayaThaiBERT
|
|
63
|
+
|
|
64
|
+
:param str text: Thai text
|
|
65
|
+
:param int num_augs: number of augmented texts to return (at most 5)
|
|
66
|
+
:param bool sample: whether to sample the text as an output or not, \
|
|
67
|
+
true if more word diversity is needed
|
|
68
|
+
|
|
69
|
+
:return: list of augmented texts
|
|
70
|
+
:rtype: List[str]
|
|
71
|
+
|
|
72
|
+
:Example:
|
|
73
|
+
::
|
|
74
|
+
|
|
75
|
+
from pythainlp.augment.lm import ThaiTextAugmenter
|
|
76
|
+
|
|
77
|
+
aug = ThaiTextAugmenter()
|
|
78
|
+
aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)
|
|
79
|
+
|
|
80
|
+
# output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
|
|
81
|
+
'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
|
|
82
|
+
'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
|
|
83
|
+
'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
|
|
84
|
+
'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
|
|
85
|
+
"""
|
|
86
|
+
MAX_NUM_AUGS = 5
|
|
87
|
+
augment_list = []
|
|
88
|
+
|
|
89
|
+
if "<mask>" not in text:
|
|
90
|
+
text = text + "<mask>"
|
|
91
|
+
|
|
92
|
+
if num_augs <= MAX_NUM_AUGS:
|
|
93
|
+
for rank in range(num_augs):
|
|
94
|
+
gen_text = self.generate(text, rank, sample=sample)
|
|
95
|
+
processed_text = re.sub(
|
|
96
|
+
"<_>", " ", self.processor.preprocess(gen_text)
|
|
97
|
+
)
|
|
98
|
+
augment_list.append(processed_text)
|
|
99
|
+
else:
|
|
100
|
+
raise ValueError(
|
|
101
|
+
f"augmentation of more than {num_augs} is exceeded \
|
|
102
|
+
the default limit: {MAX_NUM_AUGS}"
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
return augment_list
|
|
@@ -1,18 +1,9 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
15
5
|
from typing import List
|
|
6
|
+
|
|
16
7
|
from transformers import (
|
|
17
8
|
CamembertTokenizer,
|
|
18
9
|
pipeline,
|
|
@@ -62,9 +53,9 @@ class Thai2transformersAug:
|
|
|
62
53
|
|
|
63
54
|
def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
|
|
64
55
|
"""
|
|
65
|
-
Text
|
|
56
|
+
Text augmentation from WangchanBERTa
|
|
66
57
|
|
|
67
|
-
:param str sentence:
|
|
58
|
+
:param str sentence: Thai sentence
|
|
68
59
|
:param int num_replace_tokens: number replace tokens
|
|
69
60
|
|
|
70
61
|
:return: list of text augment
|
|
@@ -75,7 +66,7 @@ class Thai2transformersAug:
|
|
|
75
66
|
|
|
76
67
|
from pythainlp.augment.lm import Thai2transformersAug
|
|
77
68
|
|
|
78
|
-
aug=Thai2transformersAug()
|
|
69
|
+
aug = Thai2transformersAug()
|
|
79
70
|
|
|
80
71
|
aug.augment("ช้างมีทั้งหมด 50 ตัว บน")
|
|
81
72
|
# output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้',
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
"""
|
|
5
|
+
Word2Vec
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]
|
|
9
|
+
|
|
10
|
+
from pythainlp.augment.word2vec.core import Word2VecAug
|
|
11
|
+
from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
|
|
12
|
+
from pythainlp.augment.word2vec.ltw2v import LTW2VAug
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
from typing import List, Tuple
|
|
16
5
|
from pythainlp.augment.word2vec.core import Word2VecAug
|
|
17
6
|
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
from typing import List, Tuple
|
|
16
5
|
import itertools
|
|
17
6
|
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
from typing import List, Tuple
|
|
16
5
|
from pythainlp.augment.word2vec.core import Word2VecAug
|
|
17
6
|
from pythainlp.corpus import get_corpus_path
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
from typing import List, Tuple
|
|
16
5
|
from pythainlp.augment.word2vec.core import Word2VecAug
|
|
17
6
|
from pythainlp.corpus import get_corpus_path
|
|
@@ -1,17 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
# Copyright
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
15
4
|
"""
|
|
16
5
|
Thanks to https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
|
|
17
6
|
"""
|
|
@@ -23,6 +12,7 @@ __all__ = [
|
|
|
23
12
|
from collections import OrderedDict
|
|
24
13
|
import itertools
|
|
25
14
|
from typing import List
|
|
15
|
+
|
|
26
16
|
from nltk.corpus import wordnet as wn
|
|
27
17
|
from pythainlp.corpus import wordnet
|
|
28
18
|
from pythainlp.tokenize import word_tokenize
|