semantic-compressor 2.41__tar.gz → 2.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. semantic_compressor-2.42/.gitignore +4 -0
  2. {semantic_compressor-2.41/semantic_compressor.egg-info → semantic_compressor-2.42}/PKG-INFO +4 -3
  3. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/semantic.py +7 -5
  4. semantic_compressor-2.42/helper_commands.txt +1 -0
  5. {semantic_compressor-2.41 → semantic_compressor-2.42}/pyproject.toml +3 -3
  6. semantic_compressor-2.42/requirements.txt +6 -0
  7. {semantic_compressor-2.41 → semantic_compressor-2.42/semantic_compressor.egg-info}/PKG-INFO +4 -3
  8. {semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/SOURCES.txt +3 -1
  9. {semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/requires.txt +1 -1
  10. {semantic_compressor-2.41 → semantic_compressor-2.42}/setup.py +2 -2
  11. semantic_compressor-2.41/compressor/resources/lid.176.ftz +0 -0
  12. {semantic_compressor-2.41 → semantic_compressor-2.42}/LICENSE +0 -0
  13. {semantic_compressor-2.41 → semantic_compressor-2.42}/MANIFEST.in +0 -0
  14. {semantic_compressor-2.41 → semantic_compressor-2.42}/README.md +0 -0
  15. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/__init__.py +0 -0
  16. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/minbpe/__init__.py +0 -0
  17. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/minbpe/base.py +0 -0
  18. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/minbpe/basic.py +0 -0
  19. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/minbpe/regex.py +0 -0
  20. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/__init__.py +0 -0
  21. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/en_stopwords.pkl +0 -0
  22. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/README +0 -0
  23. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/albanian +0 -0
  24. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/arabic +0 -0
  25. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/azerbaijani +0 -0
  26. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/basque +0 -0
  27. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/belarusian +0 -0
  28. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/bengali +0 -0
  29. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/catalan +0 -0
  30. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/chinese +0 -0
  31. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/danish +0 -0
  32. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/dutch +0 -0
  33. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/english +0 -0
  34. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/finnish +0 -0
  35. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/french +0 -0
  36. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/german +0 -0
  37. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/greek +0 -0
  38. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/hebrew +0 -0
  39. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/hinglish +0 -0
  40. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/hungarian +0 -0
  41. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/indonesian +0 -0
  42. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/italian +0 -0
  43. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/kazakh +0 -0
  44. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/nepali +0 -0
  45. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/norwegian +0 -0
  46. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/portuguese +0 -0
  47. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/romanian +0 -0
  48. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/russian +0 -0
  49. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/slovene +0 -0
  50. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/spanish +0 -0
  51. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/swedish +0 -0
  52. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/tajik +0 -0
  53. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/tamil +0 -0
  54. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords/turkish +0 -0
  55. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/stopwords.zip +0 -0
  56. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/wordnet.zip +0 -0
  57. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/words/README +0 -0
  58. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/words/en +0 -0
  59. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/corpora/words/en-basic +0 -0
  60. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step0.pt +0 -0
  61. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step1.pt +0 -0
  62. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step2.pt +0 -0
  63. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step3.pt +0 -0
  64. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step4.pt +0 -0
  65. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step5.pt +0 -0
  66. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp/step6.pt +0 -0
  67. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/stemmers/rslp.zip +0 -0
  68. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +0 -0
  69. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/.DS_Store +0 -0
  70. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/README +0 -0
  71. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/czech.pickle +0 -0
  72. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/danish.pickle +0 -0
  73. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/dutch.pickle +0 -0
  74. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/english.pickle +0 -0
  75. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/estonian.pickle +0 -0
  76. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/finnish.pickle +0 -0
  77. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/french.pickle +0 -0
  78. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/german.pickle +0 -0
  79. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/greek.pickle +0 -0
  80. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/italian.pickle +0 -0
  81. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/malayalam.pickle +0 -0
  82. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/norwegian.pickle +0 -0
  83. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/polish.pickle +0 -0
  84. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/portuguese.pickle +0 -0
  85. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/russian.pickle +0 -0
  86. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle +0 -0
  87. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle +0 -0
  88. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle +0 -0
  89. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle +0 -0
  90. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/README +0 -0
  91. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/czech.pickle +0 -0
  92. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/danish.pickle +0 -0
  93. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/dutch.pickle +0 -0
  94. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/english.pickle +0 -0
  95. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/estonian.pickle +0 -0
  96. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/finnish.pickle +0 -0
  97. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/french.pickle +0 -0
  98. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/german.pickle +0 -0
  99. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/greek.pickle +0 -0
  100. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/italian.pickle +0 -0
  101. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/malayalam.pickle +0 -0
  102. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/norwegian.pickle +0 -0
  103. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/polish.pickle +0 -0
  104. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/portuguese.pickle +0 -0
  105. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/russian.pickle +0 -0
  106. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/slovene.pickle +0 -0
  107. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/spanish.pickle +0 -0
  108. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/swedish.pickle +0 -0
  109. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt/turkish.pickle +0 -0
  110. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt.zip +0 -0
  111. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/README +0 -0
  112. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +0 -0
  113. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +0 -0
  114. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
  115. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +0 -0
  116. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +0 -0
  117. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +0 -0
  118. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
  119. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +0 -0
  120. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +0 -0
  121. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +0 -0
  122. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
  123. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +0 -0
  124. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +0 -0
  125. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +0 -0
  126. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
  127. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +0 -0
  128. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +0 -0
  129. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +0 -0
  130. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
  131. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +0 -0
  132. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +0 -0
  133. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +0 -0
  134. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
  135. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +0 -0
  136. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +0 -0
  137. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +0 -0
  138. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
  139. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +0 -0
  140. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +0 -0
  141. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +0 -0
  142. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
  143. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +0 -0
  144. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +0 -0
  145. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +0 -0
  146. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +0 -0
  147. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +0 -0
  148. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +0 -0
  149. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +0 -0
  150. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +0 -0
  151. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +0 -0
  152. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +0 -0
  153. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +0 -0
  154. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +0 -0
  155. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +0 -0
  156. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +0 -0
  157. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +0 -0
  158. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +0 -0
  159. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +0 -0
  160. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +0 -0
  161. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +0 -0
  162. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +0 -0
  163. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +0 -0
  164. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +0 -0
  165. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +0 -0
  166. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +0 -0
  167. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +0 -0
  168. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +0 -0
  169. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
  170. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +0 -0
  171. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
  172. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +0 -0
  173. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +0 -0
  174. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +0 -0
  175. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +0 -0
  176. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +0 -0
  177. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +0 -0
  178. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +0 -0
  179. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +0 -0
  180. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +0 -0
  181. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +0 -0
  182. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +0 -0
  183. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +0 -0
  184. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +0 -0
  185. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +0 -0
  186. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +0 -0
  187. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +0 -0
  188. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
  189. {semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/resources/pt_stopwords.pkl +0 -0
  190. {semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  191. {semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/top_level.txt +0 -0
  192. {semantic_compressor-2.41 → semantic_compressor-2.42}/setup.cfg +0 -0
semantic_compressor-2.42/.gitignore
@@ -0,0 +1,4 @@
+ *__pycache__*
+ build/*
+ dist/*
+ *.egg-info/*
{semantic_compressor-2.41/semantic_compressor.egg-info → semantic_compressor-2.42}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: semantic_compressor
- Version: 2.41
+ Version: 2.42
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -12,10 +12,11 @@ License-File: LICENSE
  Requires-Dist: numpy<2
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
- Requires-Dist: fasttext
+ Requires-Dist: lingua-language-detector
  Requires-Dist: model2vec
  Requires-Dist: pyspellchecker
  Dynamic: author
+ Dynamic: license-file
  Dynamic: requires-python

  ```python
{semantic_compressor-2.41 → semantic_compressor-2.42}/compressor/semantic.py
@@ -6,7 +6,7 @@ from sklearn.decomposition import LatentDirichletAllocation
  from sklearn.metrics.pairwise import cosine_similarity
  from compressor.minbpe.regex import RegexTokenizer
  from concurrent.futures import ProcessPoolExecutor
- import numpy as np, pickle, fasttext, traceback
+ import numpy as np, pickle, traceback
  from nltk.tokenize import sent_tokenize
  from multiprocessing import cpu_count
  from spellchecker import SpellChecker
@@ -16,6 +16,10 @@ from collections import Counter
  from model2vec import StaticModel
  import re

+ from lingua import Language, LanguageDetectorBuilder
+ languages = [Language.ENGLISH, Language.PORTUGUESE]
+ lang_detector = LanguageDetectorBuilder.from_languages(*languages).build()
+
  tokenizer = RegexTokenizer()

  # Initializing the stemmers
@@ -24,10 +28,8 @@ stemmer_portuguese = RSLPStemmer()

  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
- fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
  english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
- langdetect_model = fasttext.load_model(fasttext_model_path)

  embedding_model = StaticModel.from_pretrained("cnmoro/static-nomic-eng-ptbr-tiny")

@@ -91,8 +93,8 @@ def count_tokens(text):
      return len(tokenizer.encode(text))

  def detect_language(text):
-     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
-     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
+     detected_lang = lang_detector.detect_language_of(text)
+     return 'pt' if detected_lang == Language.PORTUGUESE else 'en'

  def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
      words = text.split()
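
The semantic.py changes above swap fastText language identification (which loaded the bundled lid.176.ftz model) for lingua-language-detector, with detection restricted to English and Portuguese. Below is a minimal standalone sketch of the new detection path, assuming `lingua-language-detector` is installed; the sample sentences are illustrative, not taken from the package.

```python
from lingua import Language, LanguageDetectorBuilder

# Restrict detection to the two languages the package distinguishes between,
# mirroring the module-level setup added in compressor/semantic.py.
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.PORTUGUESE
).build()

def detect_language(text: str) -> str:
    # detect_language_of returns a Language member (or None when undecided);
    # anything that is not Portuguese falls back to 'en', as in the diff above.
    return 'pt' if detector.detect_language_of(text) == Language.PORTUGUESE else 'en'

print(detect_language("O gato dorme no sofá."))         # pt
print(detect_language("The cat sleeps on the couch."))  # en
```

Unlike the fastText path, no language-ID model file ships with the package anymore, which is why lid.176.ftz also disappears from SOURCES.txt further down.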
semantic_compressor-2.42/helper_commands.txt
@@ -0,0 +1 @@
+ python setup.py sdist bdist_wheel && twine upload dist/*
{semantic_compressor-2.41 → semantic_compressor-2.42}/pyproject.toml
@@ -1,10 +1,10 @@
  [build-system]
- requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
+ requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "lingua-language-detector", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
  build-backend = "setuptools.build_meta"

  [project]
  name = "semantic_compressor"
- version = "2.41"
+ version = "2.42"
  authors = [
      { name="Carlo Moro", email="cnmoro@gmail.com" },
  ]
@@ -20,7 +20,7 @@ dependencies = [
      "numpy<2",
      "nltk",
      "scikit-learn",
-     "fasttext",
+     "lingua-language-detector",
      "model2vec",
      "pyspellchecker"
  ]
semantic_compressor-2.42/requirements.txt
@@ -0,0 +1,6 @@
+ numpy<2
+ nltk
+ scikit-learn
+ lingua-language-detector
+ model2vec
+ pyspellchecker
{semantic_compressor-2.41 → semantic_compressor-2.42/semantic_compressor.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: semantic_compressor
- Version: 2.41
+ Version: 2.42
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -12,10 +12,11 @@ License-File: LICENSE
  Requires-Dist: numpy<2
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
- Requires-Dist: fasttext
+ Requires-Dist: lingua-language-detector
  Requires-Dist: model2vec
  Requires-Dist: pyspellchecker
  Dynamic: author
+ Dynamic: license-file
  Dynamic: requires-python

  ```python
{semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/SOURCES.txt
@@ -1,7 +1,10 @@
+ .gitignore
  LICENSE
  MANIFEST.in
  README.md
+ helper_commands.txt
  pyproject.toml
+ requirements.txt
  setup.py
  compressor/__init__.py
  compressor/semantic.py
@@ -11,7 +14,6 @@ compressor/minbpe/basic.py
  compressor/minbpe/regex.py
  compressor/resources/__init__.py
  compressor/resources/en_stopwords.pkl
- compressor/resources/lid.176.ftz
  compressor/resources/pt_stopwords.pkl
  compressor/resources/nltk_data/corpora/stopwords.zip
  compressor/resources/nltk_data/corpora/wordnet.zip
{semantic_compressor-2.41 → semantic_compressor-2.42}/semantic_compressor.egg-info/requires.txt
@@ -1,6 +1,6 @@
  numpy<2
  nltk
  scikit-learn
- fasttext
+ lingua-language-detector
  model2vec
  pyspellchecker
{semantic_compressor-2.41 → semantic_compressor-2.42}/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
      name='semantic_compressor',
-     version='2.41',
+     version='2.42',
      author='Carlo Moro',
      author_email='cnmoro@gmail.com',
      description="Semantic text compression",
@@ -15,7 +15,7 @@ setup(
          "numpy<2",
          "nltk",
          "scikit-learn",
-         "fasttext",
+         "lingua-language-detector",
          "model2vec",
          "pyspellchecker"
      ],