semantic-compressor 2.2__tar.gz → 2.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. {semantic_compressor-2.2/semantic_compressor.egg-info → semantic_compressor-2.41}/PKG-INFO +1 -1
  2. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/semantic.py +38 -4
  3. {semantic_compressor-2.2 → semantic_compressor-2.41}/pyproject.toml +1 -1
  4. {semantic_compressor-2.2 → semantic_compressor-2.41/semantic_compressor.egg-info}/PKG-INFO +1 -1
  5. {semantic_compressor-2.2 → semantic_compressor-2.41}/setup.py +1 -1
  6. {semantic_compressor-2.2 → semantic_compressor-2.41}/LICENSE +0 -0
  7. {semantic_compressor-2.2 → semantic_compressor-2.41}/MANIFEST.in +0 -0
  8. {semantic_compressor-2.2 → semantic_compressor-2.41}/README.md +0 -0
  9. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/__init__.py +0 -0
  10. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/minbpe/__init__.py +0 -0
  11. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/minbpe/base.py +0 -0
  12. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/minbpe/basic.py +0 -0
  13. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/minbpe/regex.py +0 -0
  14. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/__init__.py +0 -0
  15. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/en_stopwords.pkl +0 -0
  16. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/lid.176.ftz +0 -0
  17. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/README +0 -0
  18. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/albanian +0 -0
  19. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/arabic +0 -0
  20. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/azerbaijani +0 -0
  21. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/basque +0 -0
  22. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/belarusian +0 -0
  23. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/bengali +0 -0
  24. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/catalan +0 -0
  25. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/chinese +0 -0
  26. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/danish +0 -0
  27. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/dutch +0 -0
  28. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/english +0 -0
  29. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/finnish +0 -0
  30. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/french +0 -0
  31. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/german +0 -0
  32. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/greek +0 -0
  33. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/hebrew +0 -0
  34. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/hinglish +0 -0
  35. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/hungarian +0 -0
  36. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/indonesian +0 -0
  37. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/italian +0 -0
  38. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/kazakh +0 -0
  39. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/nepali +0 -0
  40. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/norwegian +0 -0
  41. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/portuguese +0 -0
  42. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/romanian +0 -0
  43. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/russian +0 -0
  44. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/slovene +0 -0
  45. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/spanish +0 -0
  46. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/swedish +0 -0
  47. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/tajik +0 -0
  48. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/tamil +0 -0
  49. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords/turkish +0 -0
  50. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/stopwords.zip +0 -0
  51. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/wordnet.zip +0 -0
  52. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/words/README +0 -0
  53. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/words/en +0 -0
  54. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/corpora/words/en-basic +0 -0
  55. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step0.pt +0 -0
  56. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step1.pt +0 -0
  57. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step2.pt +0 -0
  58. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step3.pt +0 -0
  59. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step4.pt +0 -0
  60. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step5.pt +0 -0
  61. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp/step6.pt +0 -0
  62. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/stemmers/rslp.zip +0 -0
  63. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +0 -0
  64. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/.DS_Store +0 -0
  65. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/README +0 -0
  66. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/czech.pickle +0 -0
  67. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/danish.pickle +0 -0
  68. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/dutch.pickle +0 -0
  69. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/english.pickle +0 -0
  70. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/estonian.pickle +0 -0
  71. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/finnish.pickle +0 -0
  72. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/french.pickle +0 -0
  73. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/german.pickle +0 -0
  74. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/greek.pickle +0 -0
  75. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/italian.pickle +0 -0
  76. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/malayalam.pickle +0 -0
  77. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/norwegian.pickle +0 -0
  78. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/polish.pickle +0 -0
  79. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/portuguese.pickle +0 -0
  80. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/russian.pickle +0 -0
  81. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle +0 -0
  82. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle +0 -0
  83. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle +0 -0
  84. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle +0 -0
  85. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/README +0 -0
  86. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/czech.pickle +0 -0
  87. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/danish.pickle +0 -0
  88. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/dutch.pickle +0 -0
  89. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/english.pickle +0 -0
  90. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/estonian.pickle +0 -0
  91. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/finnish.pickle +0 -0
  92. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/french.pickle +0 -0
  93. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/german.pickle +0 -0
  94. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/greek.pickle +0 -0
  95. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/italian.pickle +0 -0
  96. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/malayalam.pickle +0 -0
  97. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/norwegian.pickle +0 -0
  98. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/polish.pickle +0 -0
  99. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/portuguese.pickle +0 -0
  100. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/russian.pickle +0 -0
  101. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/slovene.pickle +0 -0
  102. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/spanish.pickle +0 -0
  103. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/swedish.pickle +0 -0
  104. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt/turkish.pickle +0 -0
  105. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt.zip +0 -0
  106. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/README +0 -0
  107. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +0 -0
  108. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +0 -0
  109. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
  110. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +0 -0
  111. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +0 -0
  112. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +0 -0
  113. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
  114. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +0 -0
  115. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +0 -0
  116. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +0 -0
  117. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
  118. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +0 -0
  119. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +0 -0
  120. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +0 -0
  121. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
  122. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +0 -0
  123. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +0 -0
  124. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +0 -0
  125. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
  126. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +0 -0
  127. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +0 -0
  128. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +0 -0
  129. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
  130. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +0 -0
  131. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +0 -0
  132. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +0 -0
  133. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
  134. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +0 -0
  135. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +0 -0
  136. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +0 -0
  137. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
  138. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +0 -0
  139. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +0 -0
  140. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +0 -0
  141. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +0 -0
  142. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +0 -0
  143. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +0 -0
  144. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +0 -0
  145. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +0 -0
  146. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +0 -0
  147. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +0 -0
  148. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +0 -0
  149. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +0 -0
  150. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +0 -0
  151. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +0 -0
  152. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +0 -0
  153. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +0 -0
  154. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +0 -0
  155. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +0 -0
  156. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +0 -0
  157. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +0 -0
  158. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +0 -0
  159. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +0 -0
  160. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +0 -0
  161. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +0 -0
  162. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +0 -0
  163. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +0 -0
  164. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
  165. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +0 -0
  166. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
  167. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +0 -0
  168. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +0 -0
  169. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +0 -0
  170. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +0 -0
  171. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +0 -0
  172. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +0 -0
  173. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +0 -0
  174. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +0 -0
  175. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +0 -0
  176. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +0 -0
  177. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +0 -0
  178. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +0 -0
  179. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +0 -0
  180. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +0 -0
  181. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +0 -0
  182. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +0 -0
  183. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
  184. {semantic_compressor-2.2 → semantic_compressor-2.41}/compressor/resources/pt_stopwords.pkl +0 -0
  185. {semantic_compressor-2.2 → semantic_compressor-2.41}/semantic_compressor.egg-info/SOURCES.txt +0 -0
  186. {semantic_compressor-2.2 → semantic_compressor-2.41}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  187. {semantic_compressor-2.2 → semantic_compressor-2.41}/semantic_compressor.egg-info/requires.txt +0 -0
  188. {semantic_compressor-2.2 → semantic_compressor-2.41}/semantic_compressor.egg-info/top_level.txt +0 -0
  189. {semantic_compressor-2.2 → semantic_compressor-2.41}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: semantic_compressor
3
- Version: 2.2
3
+ Version: 2.41
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -14,6 +14,7 @@ from nltk.stem import PorterStemmer
14
14
  from nltk.stem import RSLPStemmer
15
15
  from collections import Counter
16
16
  from model2vec import StaticModel
17
+ import re
17
18
 
18
19
  tokenizer = RegexTokenizer()
19
20
 
@@ -28,10 +29,39 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
28
29
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
29
30
  langdetect_model = fasttext.load_model(fasttext_model_path)
30
31
 
31
- embedding_model = StaticModel.from_pretrained("minishlab/potion-base-2M")
32
+ embedding_model = StaticModel.from_pretrained("cnmoro/static-nomic-eng-ptbr-tiny")
32
33
 
33
34
  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
34
35
 
36
def clean_text(text: str) -> str:
    """Normalize noisy text (e.g. PDF/OCR extractions) before compression.

    The cleanup runs in a fixed order:

    1. Re-join words hyphenated across a line break.
    2. Replace pipes, bullets, brackets, parentheses and double quotes
       with spaces.
    3. Drop leading list hyphens.
    4. Replace hyphens that are not attached to a word character.
    5. Collapse runs of punctuation into a single mark.
    6. Normalize spaces, tabs and blank lines.
    7. If fewer than 80% of the *non-whitespace* characters are letters,
       replace everything except letters, whitespace and ``. , ; : ? !``
       with spaces.
    8. Re-attach punctuation to the preceding word and ensure a space
       follows it, leaving numeric separators (``3.14``, ``10:30``,
       ``1,000``) intact.

    Args:
        text: The raw input text.

    Returns:
        The cleaned text. An empty input yields an empty string.
    """
    # 1) Fix hyphenation at line breaks: "compres-\nsion" -> "compression"
    text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)
    # 2) Strip pipes, bullets, brackets, parentheses and double quotes
    text = re.sub(r'[\|\•\[\]\(\)\"“”]', ' ', text)
    # 3) Remove leading list hyphens
    text = re.sub(r'(?m)^\s*-\s*', '', text)
    # 4) Remove hyphens not adjacent to a word character
    text = re.sub(r'(?<!\w)-(?!\w)', ' ', text)
    # 5) Collapse repeated punctuation (the last mark of the run is kept)
    text = re.sub(r'([!?.,;:]){2,}', r'\1', text)
    # 6) Normalize whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{2,}', '\n', text).strip()

    # 7) Aggressive cleanup if >20% noise, but keep basic punctuation.
    # Whitespace is excluded from the ratio: counting it as noise made
    # ordinary prose fall below the threshold and lose its digits.
    visible_chars = [ch for ch in text if not ch.isspace()]
    alpha_ratio = (
        sum(ch.isalpha() for ch in visible_chars) / max(len(visible_chars), 1)
    )
    if alpha_ratio < 0.8:
        # Use the same notion of "letter" (str.isalpha) as the ratio above,
        # so text in non-Latin scripts is not erased wholesale.
        text = ''.join(
            ch if ch.isalpha() or ch.isspace() or ch in '.,;:?!' else ' '
            for ch in text
        )
        text = re.sub(r'\s{2,}', ' ', text).strip()

    # 8) Reattach punctuation to preceding word and normalize post-punct spacing
    # "word ." -> "word."
    text = re.sub(r'\s+([\.!,\?;:])', r'\1', text)
    # "word.Next" -> "word. Next"; a separator sitting between two digits
    # ("3.14", "10:30", "1,000") is left untouched.
    text = re.sub(r'([\.!,\?;:])(?=\S)(?!(?<=\d[.,:])\d)', r'\1 ', text)

    return text
64
+
35
65
  def extract_textual_embeddings(text):
36
66
  X = hashing_vectorizer.fit_transform([text])
37
67
  dense_matrix = X.toarray()
@@ -100,7 +130,7 @@ def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
100
130
  def calculate_similarity(embed1, embed2):
101
131
  return cosine_similarity([embed1], [embed2])[0][0]
102
132
 
103
- def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
133
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
104
134
  def create_lda_model(texts, stopwords):
105
135
  vectorizer = CountVectorizer(stop_words=stopwords)
106
136
  doc_term_matrix = vectorizer.fit_transform(texts)
@@ -129,6 +159,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
129
159
  return importance
130
160
 
131
161
  try:
162
+ if perform_cleaning:
163
+ full_text = clean_text(full_text)
164
+
132
165
  # Split the text into sentences
133
166
  sentences = sent_tokenize(full_text)
134
167
 
@@ -192,7 +225,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
192
225
 
193
226
  return full_text
194
227
 
195
- def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
228
+ def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
196
229
  """
197
230
  Compress text using either a compression rate or a target token count.
198
231
  If both are provided, the compression rate will be used.
@@ -219,7 +252,8 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
219
252
  return semantic_compress_text(
220
253
  full_text = text,
221
254
  compression_rate = compression_rate,
222
- reference_text = reference_text_steering
255
+ reference_text = reference_text_steering,
256
+ perform_cleaning = perform_cleaning
223
257
  )
224
258
  except Exception:
225
259
  traceback.print_exc()
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "semantic_compressor"
7
- version = "2.2"
7
+ version = "2.41"
8
8
  authors = [
9
9
  { name="Carlo Moro", email="cnmoro@gmail.com" },
10
10
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: semantic_compressor
3
- Version: 2.2
3
+ Version: 2.41
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='semantic_compressor',
5
- version='2.2',
5
+ version='2.41',
6
6
  author='Carlo Moro',
7
7
  author_email='cnmoro@gmail.com',
8
8
  description="Semantic text compression",