pythainlp 5.0.0.dev0__tar.gz → 5.0.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/CONTRIBUTING.md +5 -2
  2. {pythainlp-5.0.0.dev0/pythainlp.egg-info → pythainlp-5.0.0.dev2}/PKG-INFO +1 -1
  3. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/README.md +46 -10
  4. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/README_TH.md +39 -7
  5. pythainlp-5.0.0.dev2/pyproject.toml +16 -0
  6. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/__init__.py +3 -19
  7. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/__main__.py +2 -13
  8. pythainlp-5.0.0.dev2/pythainlp/ancient/__init__.py +10 -0
  9. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ancient/aksonhan.py +12 -22
  10. pythainlp-5.0.0.dev2/pythainlp/augment/__init__.py +10 -0
  11. pythainlp-5.0.0.dev2/pythainlp/augment/lm/__init__.py +16 -0
  12. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/lm/fasttext.py +2 -13
  13. pythainlp-5.0.0.dev2/pythainlp/augment/lm/phayathaibert.py +105 -0
  14. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/lm/wangchanberta.py +7 -16
  15. pythainlp-5.0.0.dev2/pythainlp/augment/word2vec/__init__.py +12 -0
  16. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/bpemb_wv.py +2 -13
  17. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/core.py +2 -13
  18. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/ltw2v.py +2 -13
  19. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/word2vec/thai2fit.py +2 -13
  20. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/augment/wordnet.py +3 -13
  21. pythainlp-5.0.0.dev2/pythainlp/benchmarks/__init__.py +10 -0
  22. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/benchmarks/word_tokenization.py +2 -13
  23. pythainlp-5.0.0.dev2/pythainlp/chat/__init__.py +10 -0
  24. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/chat/core.py +28 -28
  25. pythainlp-5.0.0.dev2/pythainlp/classify/__init__.py +10 -0
  26. {pythainlp-5.0.0.dev0/pythainlp/cls → pythainlp-5.0.0.dev2/pythainlp/classify}/param_free.py +8 -15
  27. pythainlp-5.0.0.dev2/pythainlp/cli/__init__.py +25 -0
  28. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/benchmark.py +2 -13
  29. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/data.py +2 -13
  30. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/soundex.py +2 -13
  31. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/tag.py +2 -13
  32. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/cli/tokenize.py +2 -13
  33. pythainlp-5.0.0.dev2/pythainlp/cls/__init__.py +16 -0
  34. pythainlp-5.0.0.dev2/pythainlp/coref/__init__.py +8 -0
  35. pythainlp-5.0.0.dev2/pythainlp/coref/_fastcoref.py +34 -0
  36. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/coref/core.py +22 -20
  37. pythainlp-5.0.0.dev2/pythainlp/coref/han_coref.py +12 -0
  38. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/__init__.py +24 -23
  39. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/common.py +87 -48
  40. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/conceptnet.py +2 -13
  41. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/core.py +95 -51
  42. pythainlp-5.0.0.dev2/pythainlp/corpus/icu.py +26 -0
  43. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/oscar.py +2 -13
  44. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/th_en_translit.py +2 -13
  45. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/tnc.py +2 -13
  46. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/ttc.py +2 -13
  47. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/util.py +2 -13
  48. pythainlp-5.0.0.dev2/pythainlp/corpus/volubilis.py +32 -0
  49. pythainlp-5.0.0.dev2/pythainlp/corpus/wikipedia.py +35 -0
  50. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/corpus/wordnet.py +2 -13
  51. pythainlp-5.0.0.dev2/pythainlp/el/__init__.py +10 -0
  52. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/el/_multiel.py +2 -13
  53. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/el/core.py +2 -13
  54. pythainlp-5.0.0.dev2/pythainlp/generate/__init__.py +10 -0
  55. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/core.py +2 -13
  56. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/thai2fit.py +2 -13
  57. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/generate/wangchanglm.py +2 -13
  58. pythainlp-5.0.0.dev2/pythainlp/khavee/__init__.py +7 -0
  59. pythainlp-5.0.0.dev2/pythainlp/khavee/core.py +652 -0
  60. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/khavee/example.py +58 -18
  61. pythainlp-5.0.0.dev2/pythainlp/morpheme/__init__.py +13 -0
  62. {pythainlp-5.0.0.dev0/pythainlp/util → pythainlp-5.0.0.dev2/pythainlp/morpheme}/thaiwordcheck.py +4 -14
  63. pythainlp-5.0.0.dev2/pythainlp/morpheme/word_formation.py +56 -0
  64. pythainlp-5.0.0.dev2/pythainlp/parse/__init__.py +10 -0
  65. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/core.py +10 -14
  66. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/spacy_thai_engine.py +2 -0
  67. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/transformers_ud.py +13 -9
  68. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/ud_goeswith.py +32 -21
  69. pythainlp-5.0.0.dev2/pythainlp/phayathaibert/__init__.py +23 -0
  70. pythainlp-5.0.0.dev2/pythainlp/phayathaibert/core.py +449 -0
  71. pythainlp-5.0.0.dev2/pythainlp/soundex/__init__.py +25 -0
  72. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/core.py +2 -13
  73. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/lk82.py +2 -13
  74. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/metasound.py +2 -13
  75. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/prayut_and_somchaip.py +2 -13
  76. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/sound.py +2 -13
  77. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/soundex/udom83.py +2 -13
  78. pythainlp-5.0.0.dev2/pythainlp/spell/__init__.py +21 -0
  79. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/core.py +2 -13
  80. pythainlp-5.0.0.dev2/pythainlp/spell/phunspell.py +24 -0
  81. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/symspellpy.py +2 -13
  82. pythainlp-5.0.0.dev2/pythainlp/spell/tltk.py +21 -0
  83. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/wanchanberta_thai_grammarly.py +2 -13
  84. pythainlp-5.0.0.dev2/pythainlp/summarize/__init__.py +14 -0
  85. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/core.py +2 -13
  86. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/freq.py +2 -13
  87. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/keybert.py +2 -13
  88. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/summarize/mt5.py +2 -13
  89. pythainlp-5.0.0.dev2/pythainlp/tag/__init__.py +26 -0
  90. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/_tag_perceptron.py +2 -13
  91. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/blackboard.py +2 -13
  92. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/chunk.py +2 -13
  93. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/crfchunk.py +2 -13
  94. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/locations.py +2 -13
  95. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/named_entity.py +11 -16
  96. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/orchid.py +2 -13
  97. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/perceptron.py +2 -13
  98. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/pos_tag.py +25 -27
  99. pythainlp-5.0.0.dev2/pythainlp/tag/thai_nner.py +14 -0
  100. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/thainer.py +2 -13
  101. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/tltk.py +2 -13
  102. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/unigram.py +2 -13
  103. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tag/wangchanberta_onnx.py +2 -13
  104. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/__init__.py +6 -17
  105. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/_utils.py +2 -13
  106. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/attacut.py +2 -13
  107. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/core.py +67 -70
  108. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/crfcls.py +2 -13
  109. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/crfcut.py +2 -13
  110. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/deepcut.py +3 -14
  111. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/etcc.py +2 -13
  112. pythainlp-5.0.0.dev2/pythainlp/tokenize/han_solo.py +130 -0
  113. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/longest.py +3 -1
  114. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/multi_cut.py +2 -13
  115. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/nercut.py +2 -13
  116. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/newmm.py +3 -14
  117. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/nlpo3.py +2 -13
  118. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/oskut.py +2 -13
  119. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/pyicu.py +2 -13
  120. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/sefr_cut.py +2 -13
  121. pythainlp-5.0.0.dev2/pythainlp/tokenize/ssg.py +16 -0
  122. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tcc.py +2 -13
  123. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tcc_p.py +2 -13
  124. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/thaisumcut.py +3 -16
  125. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tokenize/tltk.py +2 -13
  126. pythainlp-5.0.0.dev2/pythainlp/tokenize/wtsplit.py +78 -0
  127. pythainlp-5.0.0.dev2/pythainlp/tools/__init__.py +16 -0
  128. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tools/misspell.py +2 -13
  129. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/tools/path.py +2 -13
  130. pythainlp-5.0.0.dev2/pythainlp/translate/__init__.py +15 -0
  131. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/core.py +2 -13
  132. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/en_th.py +2 -13
  133. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/th_fr.py +2 -13
  134. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/zh_th.py +2 -13
  135. pythainlp-5.0.0.dev2/pythainlp/transliterate/__init__.py +11 -0
  136. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/core.py +2 -13
  137. pythainlp-5.0.0.dev2/pythainlp/transliterate/ipa.py +28 -0
  138. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/iso_11940.py +2 -13
  139. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/lookup.py +2 -13
  140. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/pyicu.py +2 -13
  141. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/royin.py +2 -13
  142. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/spoonerism.py +2 -13
  143. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thai2rom.py +2 -13
  144. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thai2rom_onnx.py +2 -13
  145. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/thaig2p.py +2 -13
  146. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/tltk.py +2 -13
  147. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/w2p.py +2 -13
  148. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/transliterate/wunsen.py +2 -13
  149. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/__init__.py +2 -13
  150. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/core.py +2 -13
  151. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/preprocess.py +2 -13
  152. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/ulmfit/tokenizer.py +2 -13
  153. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/__init__.py +4 -14
  154. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/abbreviation.py +2 -13
  155. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/collate.py +2 -13
  156. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/date.py +2 -13
  157. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/digitconv.py +2 -13
  158. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/emojiconv.py +2 -13
  159. pythainlp-5.0.0.dev2/pythainlp/util/encoding.py +39 -0
  160. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/keyboard.py +2 -13
  161. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/keywords.py +2 -13
  162. pythainlp-5.0.0.dev2/pythainlp/util/morse.py +199 -0
  163. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/normalize.py +2 -13
  164. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/numtoword.py +2 -13
  165. pythainlp-5.0.0.dev2/pythainlp/util/phoneme.py +257 -0
  166. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/pronounce.py +2 -13
  167. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/remove_trailing_repeat_consonants.py +13 -22
  168. pythainlp-5.0.0.dev2/pythainlp/util/spell_words.py +129 -0
  169. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/strftime.py +2 -13
  170. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/syllable.py +2 -13
  171. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/thai.py +2 -13
  172. pythainlp-5.0.0.dev2/pythainlp/util/thaiwordcheck.py +15 -0
  173. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/time.py +3 -18
  174. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/trie.py +7 -18
  175. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/util/wordtonum.py +2 -13
  176. pythainlp-5.0.0.dev2/pythainlp/wangchanberta/__init__.py +14 -0
  177. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/wangchanberta/core.py +39 -34
  178. pythainlp-5.0.0.dev2/pythainlp/word_vector/__init__.py +15 -0
  179. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/word_vector/core.py +2 -13
  180. pythainlp-5.0.0.dev2/pythainlp/wsd/__init__.py +8 -0
  181. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/wsd/core.py +60 -47
  182. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2/pythainlp.egg-info}/PKG-INFO +1 -1
  183. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/SOURCES.txt +15 -2
  184. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/setup.cfg +1 -1
  185. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/setup.py +3 -14
  186. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/__init__.py +2 -0
  187. pythainlp-5.0.0.dev2/tests/test_ancient.py +22 -0
  188. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_augment.py +7 -0
  189. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_benchmarks.py +6 -2
  190. pythainlp-5.0.0.dev2/tests/test_classify.py +23 -0
  191. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_cli.py +4 -4
  192. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_coref.py +2 -0
  193. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_corpus.py +54 -14
  194. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_el.py +3 -0
  195. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_generate.py +2 -0
  196. pythainlp-5.0.0.dev2/tests/test_khavee.py +49 -0
  197. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_misspell.py +4 -6
  198. pythainlp-5.0.0.dev2/tests/test_morpheme.py +37 -0
  199. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_parse.py +5 -1
  200. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_soundex.py +11 -3
  201. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_spell.py +4 -4
  202. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_summarize.py +11 -3
  203. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tag.py +49 -32
  204. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tokenize.py +86 -89
  205. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_tools.py +2 -0
  206. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_ulmfit.py +13 -21
  207. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_util.py +114 -157
  208. pythainlp-5.0.0.dev2/tests/test_wsd.py +13 -0
  209. pythainlp-5.0.0.dev0/pythainlp/ancient/__init__.py +0 -21
  210. pythainlp-5.0.0.dev0/pythainlp/augment/__init__.py +0 -21
  211. pythainlp-5.0.0.dev0/pythainlp/augment/lm/__init__.py +0 -25
  212. pythainlp-5.0.0.dev0/pythainlp/augment/word2vec/__init__.py +0 -23
  213. pythainlp-5.0.0.dev0/pythainlp/benchmarks/__init__.py +0 -21
  214. pythainlp-5.0.0.dev0/pythainlp/chat/__init__.py +0 -21
  215. pythainlp-5.0.0.dev0/pythainlp/cli/__init__.py +0 -36
  216. pythainlp-5.0.0.dev0/pythainlp/cls/__init__.py +0 -21
  217. pythainlp-5.0.0.dev0/pythainlp/coref/__init__.py +0 -19
  218. pythainlp-5.0.0.dev0/pythainlp/coref/_fastcoref.py +0 -38
  219. pythainlp-5.0.0.dev0/pythainlp/coref/han_coref.py +0 -25
  220. pythainlp-5.0.0.dev0/pythainlp/el/__init__.py +0 -21
  221. pythainlp-5.0.0.dev0/pythainlp/generate/__init__.py +0 -21
  222. pythainlp-5.0.0.dev0/pythainlp/khavee/__init__.py +0 -17
  223. pythainlp-5.0.0.dev0/pythainlp/khavee/core.py +0 -558
  224. pythainlp-5.0.0.dev0/pythainlp/parse/__init__.py +0 -19
  225. pythainlp-5.0.0.dev0/pythainlp/soundex/__init__.py +0 -36
  226. pythainlp-5.0.0.dev0/pythainlp/spell/__init__.py +0 -32
  227. pythainlp-5.0.0.dev0/pythainlp/spell/phunspell.py +0 -35
  228. pythainlp-5.0.0.dev0/pythainlp/spell/tltk.py +0 -32
  229. pythainlp-5.0.0.dev0/pythainlp/summarize/__init__.py +0 -25
  230. pythainlp-5.0.0.dev0/pythainlp/tag/__init__.py +0 -37
  231. pythainlp-5.0.0.dev0/pythainlp/tag/thai_nner.py +0 -25
  232. pythainlp-5.0.0.dev0/pythainlp/tokenize/han_solo.py +0 -144
  233. pythainlp-5.0.0.dev0/pythainlp/tokenize/ssg.py +0 -27
  234. pythainlp-5.0.0.dev0/pythainlp/tokenize/wtsplit.py +0 -85
  235. pythainlp-5.0.0.dev0/pythainlp/tools/__init__.py +0 -27
  236. pythainlp-5.0.0.dev0/pythainlp/translate/__init__.py +0 -26
  237. pythainlp-5.0.0.dev0/pythainlp/transliterate/__init__.py +0 -22
  238. pythainlp-5.0.0.dev0/pythainlp/transliterate/ipa.py +0 -39
  239. pythainlp-5.0.0.dev0/pythainlp/util/encoding.py +0 -31
  240. pythainlp-5.0.0.dev0/pythainlp/util/phoneme.py +0 -253
  241. pythainlp-5.0.0.dev0/pythainlp/util/spell_words.py +0 -121
  242. pythainlp-5.0.0.dev0/pythainlp/wangchanberta/__init__.py +0 -21
  243. pythainlp-5.0.0.dev0/pythainlp/word_vector/__init__.py +0 -26
  244. pythainlp-5.0.0.dev0/pythainlp/wsd/__init__.py +0 -19
  245. pythainlp-5.0.0.dev0/tests/test_ancient.py +0 -20
  246. pythainlp-5.0.0.dev0/tests/test_cls.py +0 -20
  247. pythainlp-5.0.0.dev0/tests/test_khavee.py +0 -43
  248. pythainlp-5.0.0.dev0/tests/test_wsd.py +0 -10
  249. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/LICENSE +0 -0
  250. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/MANIFEST.in +0 -0
  251. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/parse/esupar_engine.py +0 -0
  252. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/spell/pn.py +0 -0
  253. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/small100.py +0 -0
  254. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp/translate/tokenization_small100.py +0 -0
  255. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/dependency_links.txt +0 -0
  256. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/entry_points.txt +0 -0
  257. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/not-zip-safe +0 -0
  258. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/requires.txt +0 -0
  259. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/pythainlp.egg-info/top_level.txt +0 -0
  260. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/eval-details-input.json +0 -0
  261. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/eval-input.yml +0 -0
  262. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/input.txt +0 -0
  263. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/sentences.yml +0 -0
  264. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/data/test.txt +0 -0
  265. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_translate.py +0 -0
  266. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_transliterate.py +0 -0
  267. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_wangchanberta.py +0 -0
  268. {pythainlp-5.0.0.dev0 → pythainlp-5.0.0.dev2}/tests/test_word_vector.py +0 -0
@@ -135,10 +135,10 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
135
135
  - Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com> - foundation, distribution and maintenance
136
136
  - Korakot Chaovavanich - initial tokenization and soundex codes
137
137
  - Charin Polpanumas - classification and benchmarking
138
- - Peeradej Tanruangporn - documentation
139
138
  - Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance
140
- - Chakri Lowphansirikul - documentation
139
+ - Lalita Lowphansirikul - documentation
141
140
  - Pattarawat Chormai - benchmarking
141
+ - Peerat Limkonchotiwat
142
142
  - Thanathip Suntorntip - nlpO3 maintenance, Rust Developer
143
143
  - Can Udomcharoenchaikit - documentation and codes
144
144
 
@@ -147,6 +147,9 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
147
147
  - Wannaphong Phatthiyaphaibun
148
148
 
149
149
 
150
+ ### Past
151
+ - Peeradej Tanruangporn - documentation
152
+
150
153
  ## References
151
154
 
152
155
  - **[Maximum Matching]** -- Manabu Sassano. Deterministic Word Segmentation Using Maximum Matching with Fully Lexicalized Rules. Retrieved from http://www.aclweb.org/anthology/E14-4016
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pythainlp
3
- Version: 5.0.0.dev0
3
+ Version: 5.0.0.dev2
4
4
  Summary: Thai Natural Language Processing library
5
5
  Home-page: https://github.com/PyThaiNLP/pythainlp
6
6
  Author: PyThaiNLP
@@ -29,7 +29,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร
29
29
 
30
30
  ## Getting Started
31
31
 
32
- - PyThaiNLP 2 requires Python 3.6+. Python 2.7 users can use PyThaiNLP 1.6. See [2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) | [Upgrading from 1.7](https://pythainlp.github.io/docs/2.0/notes/pythainlp-1_7-2_0.html) | [Upgrading ThaiNER from 1.7](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0)
32
+ - PyThaiNLP 2 requires Python 3.7+. Python 2.7 users can use PyThaiNLP 1.6. See [2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) | [Upgrading from 1.7](https://pythainlp.github.io/docs/2.0/notes/pythainlp-1_7-2_0.html) | [Upgrading ThaiNER from 1.7](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0)
33
33
  - [PyThaiNLP Get Started notebook](https://pythainlp.github.io/tutorials/notebooks/pythainlp_get_started.html) | [API document](https://pythainlp.github.io/docs) | [Tutorials](https://pythainlp.github.io/tutorials)
34
34
  - [Official website](https://pythainlp.github.io/) | [PyPI](https://pypi.org/project/pythainlp/) | [Facebook page](https://www.facebook.com/pythainlp/)
35
35
  - [Who uses PyThaiNLP?](https://github.com/PyThaiNLP/pythainlp/blob/dev/INTHEWILD.md)
@@ -142,24 +142,60 @@ You can read [INTHEWILD.md](https://github.com/PyThaiNLP/pythainlp/blob/dev/INTH
142
142
 
143
143
  If you use `PyThaiNLP` in your project or publication, please cite the library as follows:
144
144
 
145
- ```
146
145
  Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. http://doi.org/10.5281/zenodo.3519354
147
- ```
148
146
 
149
147
  or by BibTeX entry:
150
148
 
151
149
  ``` bib
152
150
  @misc{pythainlp,
153
- author = {Wannaphong Phatthiyaphaibun and Korakot Chaovavanich and Charin Polpanumas and Arthit Suriyawongkul and Lalita Lowphansirikul and Pattarawat Chormai},
154
- title = {{PyThaiNLP: Thai Natural Language Processing in Python}},
155
- month = Jun,
156
- year = 2016,
157
- doi = {10.5281/zenodo.3519354},
158
- publisher = {Zenodo},
159
- url = {http://doi.org/10.5281/zenodo.3519354}
151
+ title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
152
+ author = "Phatthiyaphaibun, Wannaphong and
153
+ Chaovavanich, Korakot and
154
+ Polpanumas, Charin and
155
+ Suriyawongkul, Arthit and
156
+ Lowphansirikul, Lalita and
157
+ Chormai, Pattarawat",
158
+ month = jun,
159
+ year = "2016",
160
+ doi = {10.5281/zenodo.3519354},
161
+ publisher = {Zenodo},
162
+ url = {http://doi.org/10.5281/zenodo.3519354}
160
163
  }
161
164
  ```
162
165
 
166
+ Our [NLP-OSS 2023](https://nlposs.github.io/2023/) paper:
167
+
168
+ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. [PyThaiNLP: Thai Natural Language Processing in Python.](https://aclanthology.org/2023.nlposs-1.4) In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.
169
+
170
+ and its BibTeX entry:
171
+
172
+ ```bib
173
+ @inproceedings{phatthiyaphaibun-etal-2023-pythainlp,
174
+ title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
175
+ author = "Phatthiyaphaibun, Wannaphong and
176
+ Chaovavanich, Korakot and
177
+ Polpanumas, Charin and
178
+ Suriyawongkul, Arthit and
179
+ Lowphansirikul, Lalita and
180
+ Chormai, Pattarawat and
181
+ Limkonchotiwat, Peerat and
182
+ Suntorntip, Thanathip and
183
+ Udomcharoenchaikit, Can",
184
+ editor = "Tan, Liling and
185
+ Milajevs, Dmitrijs and
186
+ Chauhan, Geeticka and
187
+ Gwinnup, Jeremy and
188
+ Rippeth, Elijah",
189
+ booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
190
+ month = dec,
191
+ year = "2023",
192
+ address = "Singapore, Singapore",
193
+ publisher = "Empirical Methods in Natural Language Processing",
194
+ url = "https://aclanthology.org/2023.nlposs-1.4",
195
+ pages = "25--36",
196
+ abstract = "We present PyThaiNLP, a free and open-source natural language processing (NLP) library for Thai language implemented in Python. It provides a wide range of software, models, and datasets for Thai language. We first provide a brief historical context of tools for Thai language prior to the development of PyThaiNLP. We then outline the functionalities it provided as well as datasets and pre-trained language models. We later summarize its development milestones and discuss our experience during its development. We conclude by demonstrating how industrial and research communities utilize PyThaiNLP in their work. The library is freely available at https://github.com/pythainlp/pythainlp.",
197
+ }
198
+ ```
163
199
 
164
200
  ## Sponsors
165
201
 
@@ -123,13 +123,11 @@ thainlp help
123
123
 
124
124
  ## การอ้างอิง
125
125
 
126
- ถ้าคุณใช้ `PyThaiNLP` ในโปรเจคหรืองานวิจัยของคุณ คุณสามารถอ้างอิงได้ตามนี้
126
+ หากคุณใช้ซอฟต์แวร์ `PyThaiNLP` ในโครงงานหรืองานวิจัยของคุณ คุณสามารถอ้างอิงได้ตามนี้
127
127
 
128
- ```
129
128
  Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. http://doi.org/10.5281/zenodo.3519354
130
- ```
131
129
 
132
- หรือ BibTeX entry:
130
+ โดยสามารถใช้ BibTeX นี้:
133
131
 
134
132
  ``` bib
135
133
  @misc{pythainlp,
@@ -143,6 +141,40 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur
143
141
  }
144
142
  ```
145
143
 
144
+ บทความของเราในงานประชุมวิชาการ [NLP-OSS 2023](https://nlposs.github.io/2023/):
145
+
146
+ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. [PyThaiNLP: Thai Natural Language Processing in Python.](https://aclanthology.org/2023.nlposs-1.4) In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.
147
+
148
+ โดยสามารถใช้ BibTeX นี้:
149
+
150
+ ```bib
151
+ @inproceedings{phatthiyaphaibun-etal-2023-pythainlp,
152
+ title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in {P}ython",
153
+ author = "Phatthiyaphaibun, Wannaphong and
154
+ Chaovavanich, Korakot and
155
+ Polpanumas, Charin and
156
+ Suriyawongkul, Arthit and
157
+ Lowphansirikul, Lalita and
158
+ Chormai, Pattarawat and
159
+ Limkonchotiwat, Peerat and
160
+ Suntorntip, Thanathip and
161
+ Udomcharoenchaikit, Can",
162
+ editor = "Tan, Liling and
163
+ Milajevs, Dmitrijs and
164
+ Chauhan, Geeticka and
165
+ Gwinnup, Jeremy and
166
+ Rippeth, Elijah",
167
+ booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
168
+ month = dec,
169
+ year = "2023",
170
+ address = "Singapore, Singapore",
171
+ publisher = "Empirical Methods in Natural Language Processing",
172
+ url = "https://aclanthology.org/2023.nlposs-1.4",
173
+ pages = "25--36",
174
+ abstract = "We present PyThaiNLP, a free and open-source natural language processing (NLP) library for Thai language implemented in Python. It provides a wide range of software, models, and datasets for Thai language. We first provide a brief historical context of tools for Thai language prior to the development of PyThaiNLP. We then outline the functionalities it provided as well as datasets and pre-trained language models. We later summarize its development milestones and discuss our experience during its development. We conclude by demonstrating how industrial and research communities utilize PyThaiNLP in their work. The library is freely available at https://github.com/pythainlp/pythainlp.",
175
+ }
176
+ ```
177
+
146
178
  ## ร่วมสนับสนุน PyThaiNLP
147
179
 
148
180
  - กรุณา fork แล้วพัฒนาต่อ จากนั้นสร้าง pull request กลับมา :)
@@ -157,10 +189,10 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur
157
189
 
158
190
  | | สัญญาอนุญาต |
159
191
  |:---|:----|
160
- | PyThaiNLP Source Code and Notebooks | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) |
161
- | Corpora, datasets, and documentations created by PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)|
192
+ | ต้นรหัสซอร์สโค้ดและโน้ตบุ๊กของ PyThaiNLP | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) |
193
+ | ฐานข้อมูลภาษา ชุดข้อมูล และเอกสารที่สร้างโดยโครงการ PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)|
162
194
  | Language models created by PyThaiNLP | [Creative Commons Attribution 4.0 International Public License (CC-by)](https://creativecommons.org/licenses/by/4.0/) |
163
- | Other corpora and models that may included with PyThaiNLP | See [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) |
195
+ | สำหรับฐานข้อมูลภาษาและโมเดลอื่นที่อาจมาพร้อมกับซอฟต์แวร์ PyThaiNLP | ดู [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) |
164
196
 
165
197
 
166
198
  ## บัตรโมเดล
@@ -0,0 +1,16 @@
1
+ [tool.ruff]
2
+ line-length = 79
3
+ indent-width = 4
4
+ target-version = "py38"
5
+
6
+ [tool.ruff.format]
7
+ quote-style = "double"
8
+ indent-style = "space"
9
+ skip-magic-trailing-comma = false
10
+ line-ending = "auto"
11
+ docstring-code-format = true
12
+
13
+ [tool.ruff.lint.mccabe]
14
+ # Flag errors (`C901`) whenever the complexity level exceeds 40. Default is 10.
15
+ # We should aim to gradually reduce this to 10.
16
+ max-complexity = 40
@@ -1,23 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
- # PyThaiNLP: Thai Natural Language Processing in Python
3
- #
4
- # Copyright (C) 2016-2023 PyThaiNLP Project
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
- #
18
- # URL: <https://pythainlp.github.io/>
19
- # For license information, see LICENSE
20
- __version__ = "5.0.0dev0"
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ __version__ = "5.0.0dev2"
21
5
 
22
6
  thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
23
7
 
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  import argparse
16
5
  import sys
17
6
 
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ """
5
+ Ancient versions of the Thai language
6
+ """
7
+
8
+ __all__ = ["aksonhan_to_current"]
9
+
10
+ from pythainlp.ancient.aksonhan import aksonhan_to_current
@@ -1,34 +1,23 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  from pythainlp.util import Trie
16
- from pythainlp import thai_consonants,thai_tonemarks
5
+ from pythainlp import thai_consonants, thai_tonemarks
17
6
  from pythainlp.tokenize import Tokenizer
18
7
  from pythainlp.corpus import thai_orst_words
19
8
 
20
9
 
21
10
  _dict_aksonhan = {}
22
11
  for i in list(thai_consonants):
23
- if i=="ร":
12
+ if i == "ร":
24
13
  continue
25
14
  for j in list(thai_tonemarks):
26
- _dict_aksonhan[i+j+i] = "ั"+j+i
27
- _dict_aksonhan[i+i+j+i] = i+"ั"+j+i
28
- _dict_aksonhan[i+i] = "ั"+i
15
+ _dict_aksonhan[i + j + i] = "ั" + j + i
16
+ _dict_aksonhan[i + i + j + i] = i + "ั" + j + i
17
+ _dict_aksonhan[i + i] = "ั" + i
29
18
  _set_aksonhan = set(_dict_aksonhan.keys())
30
- _trie = Trie(list(_dict_aksonhan.keys())+list(thai_consonants))
31
- _tokenizer = Tokenizer(custom_dict=_trie,engine="mm")
19
+ _trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
20
+ _tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
32
21
  _dict_thai = set(thai_orst_words()) # call Thai words
33
22
 
34
23
 
@@ -63,8 +52,9 @@ def aksonhan_to_current(word: str) -> str:
63
52
  return word
64
53
  elif word in _set_aksonhan:
65
54
  return _dict_aksonhan[word]
66
- elif word in _dict_thai: # word in Thai words
55
+ elif word in _dict_thai: # word in Thai words
67
56
  return word
57
+
68
58
  _seg = _tokenizer.word_tokenize(word)
69
59
  _w = []
70
60
  for i in _seg:
@@ -72,4 +62,4 @@ def aksonhan_to_current(word: str) -> str:
72
62
  _w.append(_dict_aksonhan[i])
73
63
  else:
74
64
  _w.append(i)
75
- return ''.join(_w)
65
+ return "".join(_w)
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ """
5
+ Thai text augment
6
+ """
7
+
8
+ __all__ = ["WordNetAug"]
9
+
10
+ from pythainlp.augment.wordnet import WordNetAug
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ """
5
+ Language Models
6
+ """
7
+
8
+ __all__ = [
9
+ "FastTextAug",
10
+ "Thai2transformersAug",
11
+ "ThaiTextAugmenter",
12
+ ]
13
+
14
+ from pythainlp.augment.lm.fasttext import FastTextAug
15
+ from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
16
+ from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  import itertools
16
5
  from typing import List, Tuple
17
6
  from gensim.models.fasttext import FastText as FastText_gensim
@@ -0,0 +1,105 @@
1
+ # -*- coding: utf-8 -*-
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from typing import List
6
+ import random
7
+ import re
8
+
9
+ from pythainlp.phayathaibert.core import ThaiTextProcessor
10
+
11
+
12
+ _MODEL_NAME = "clicknext/phayathaibert"
13
+
14
+
15
class ThaiTextAugmenter:
    """
    Text augmentation with the PhayaThaiBERT masked language model

    New text is produced by repeatedly filling a trailing ``<mask>`` token
    with one of the candidates returned by a ``fill-mask`` pipeline.
    """

    # Largest ``num_augs`` accepted by :meth:`augment`.
    _MAX_NUM_AUGS = 5
    # Highest candidate index drawn when :meth:`generate` samples.
    # NOTE(review): presumably the fill-mask pipeline returns at least
    # five candidates per call -- confirm against the pipeline settings.
    _MAX_SAMPLE_RANK = 4

    def __init__(self) -> None:
        # transformers is imported here, when an augmenter is created,
        # rather than at module import time.
        from transformers import (
            AutoTokenizer,
            AutoModelForMaskedLM,
            pipeline,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
            _MODEL_NAME
        )
        self.model = pipeline(
            "fill-mask",
            tokenizer=self.tokenizer,
            model=self.model_for_masked_lm,
        )
        self.processor = ThaiTextProcessor()

    def generate(
        self,
        sample_text: str,
        word_rank: int,
        max_length: int = 3,
        sample: bool = False,
    ) -> str:
        """
        Extend a masked text by filling its mask token several times

        On every step the text is preprocessed, passed to the fill-mask
        pipeline, and a new ``<mask>`` is appended to the chosen candidate
        so that the next step can extend it further.

        :param str sample_text: text containing a ``<mask>`` token
        :param int word_rank: index of the pipeline candidate to use on
                              every step; ignored when ``sample`` is true
        :param int max_length: number of fill steps to perform
        :param bool sample: pick a random candidate index on every step
                            instead of ``word_rank``

        :return: the generated text with all ``<mask>`` tokens removed,
                 or an empty string when ``max_length`` is 0
        :rtype: str
        """
        masked_text = sample_text
        generated_text = ""

        for _ in range(max_length):
            model_input = self.processor.preprocess(masked_text)
            if sample:
                rank = random.randint(0, self._MAX_SAMPLE_RANK)
            else:
                rank = word_rank
            filled_text = self.model(model_input)[rank]["sequence"]
            # Re-arm the mask so the next step appends another word.
            masked_text = filled_text + "<mask>"
            generated_text = masked_text

        return generated_text.replace("<mask>", "")

    def augment(
        self, text: str, num_augs: int = 3, sample: bool = False
    ) -> List[str]:
        """
        Text augmentation from PhayaThaiBERT

        :param str text: Thai text
        :param int num_augs: an amount of augmentation text needed as an output
        :param bool sample: whether to sample the text as an output or not, \
                            true if more word diversity is needed

        :return: list of text augment
        :rtype: List[str]
        :raises ValueError: if ``num_augs`` is greater than 5

        :Example:
        ::

            from pythainlp.augment.lm import ThaiTextAugmenter

            aug = ThaiTextAugmenter()
            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)

            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
                'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
                'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
                'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
                'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
        """
        # Reject oversized requests before any model call is made.
        if num_augs > self._MAX_NUM_AUGS:
            raise ValueError(
                f"num_augs ({num_augs}) exceeds the maximum number of "
                f"augmentations allowed: {self._MAX_NUM_AUGS}"
            )

        if "<mask>" not in text:
            text = text + "<mask>"

        augment_list = []
        # Each output uses a different candidate rank (0 .. num_augs - 1).
        for rank in range(num_augs):
            gen_text = self.generate(text, rank, sample=sample)
            # Replace the "<_>" marker in the preprocessed text with a
            # plain space.
            processed_text = self.processor.preprocess(gen_text).replace(
                "<_>", " "
            )
            augment_list.append(processed_text)

        return augment_list
@@ -1,18 +1,9 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
15
5
  from typing import List
6
+
16
7
  from transformers import (
17
8
  CamembertTokenizer,
18
9
  pipeline,
@@ -62,9 +53,9 @@ class Thai2transformersAug:
62
53
 
63
54
  def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
64
55
  """
65
- Text Augment from wangchanberta
56
+ Text augmentation from WangchanBERTa
66
57
 
67
- :param str sentence: thai sentence
58
+ :param str sentence: Thai sentence
68
59
  :param int num_replace_tokens: number replace tokens
69
60
 
70
61
  :return: list of text augment
@@ -75,7 +66,7 @@ class Thai2transformersAug:
75
66
 
76
67
  from pythainlp.augment.lm import Thai2transformersAug
77
68
 
78
- aug=Thai2transformersAug()
69
+ aug = Thai2transformersAug()
79
70
 
80
71
  aug.augment("ช้างมีทั้งหมด 50 ตัว บน")
81
72
  # output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้',
@@ -0,0 +1,12 @@
1
+ # -*- coding: utf-8 -*-
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ """
5
+ Word2Vec
6
+ """
7
+
8
+ __all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]
9
+
10
+ from pythainlp.augment.word2vec.core import Word2VecAug
11
+ from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
12
+ from pythainlp.augment.word2vec.ltw2v import LTW2VAug
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  from typing import List, Tuple
16
5
  from pythainlp.augment.word2vec.core import Word2VecAug
17
6
 
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  from typing import List, Tuple
16
5
  import itertools
17
6
 
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  from typing import List, Tuple
16
5
  from pythainlp.augment.word2vec.core import Word2VecAug
17
6
  from pythainlp.corpus import get_corpus_path
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  from typing import List, Tuple
16
5
  from pythainlp.augment.word2vec.core import Word2VecAug
17
6
  from pythainlp.corpus import get_corpus_path
@@ -1,17 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
- # Copyright (C) 2016-2023 PyThaiNLP Project
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
2
+ # SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3
+ # SPDX-License-Identifier: Apache-2.0
15
4
  """
16
5
  Thank https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
17
6
  """
@@ -23,6 +12,7 @@ __all__ = [
23
12
  from collections import OrderedDict
24
13
  import itertools
25
14
  from typing import List
15
+
26
16
  from nltk.corpus import wordnet as wn
27
17
  from pythainlp.corpus import wordnet
28
18
  from pythainlp.tokenize import word_tokenize