semantic-compressor 2.3.tar.gz → 2.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. {semantic_compressor-2.3/semantic_compressor.egg-info → semantic_compressor-2.5}/PKG-INFO +5 -4
  2. semantic_compressor-2.5/compressor/semantic.py +414 -0
  3. {semantic_compressor-2.3 → semantic_compressor-2.5}/pyproject.toml +4 -4
  4. {semantic_compressor-2.3 → semantic_compressor-2.5/semantic_compressor.egg-info}/PKG-INFO +5 -4
  5. {semantic_compressor-2.3 → semantic_compressor-2.5}/semantic_compressor.egg-info/SOURCES.txt +13 -2
  6. {semantic_compressor-2.3 → semantic_compressor-2.5}/semantic_compressor.egg-info/requires.txt +2 -2
  7. {semantic_compressor-2.3 → semantic_compressor-2.5}/semantic_compressor.egg-info/top_level.txt +1 -0
  8. {semantic_compressor-2.3 → semantic_compressor-2.5}/setup.py +3 -3
  9. semantic_compressor-2.5/tests/__init__.py +0 -0
  10. semantic_compressor-2.5/tests/conftest.py +92 -0
  11. semantic_compressor-2.5/tests/test_benchmark.py +187 -0
  12. semantic_compressor-2.5/tests/test_clean_text.py +90 -0
  13. semantic_compressor-2.5/tests/test_compression.py +123 -0
  14. semantic_compressor-2.5/tests/test_correct_spelling.py +30 -0
  15. semantic_compressor-2.5/tests/test_embeddings.py +74 -0
  16. semantic_compressor-2.5/tests/test_find_needle_in_haystack.py +147 -0
  17. semantic_compressor-2.5/tests/test_language_and_stemming.py +74 -0
  18. semantic_compressor-2.5/tests/test_ngrams.py +60 -0
  19. semantic_compressor-2.5/tests/test_semantic_embeddings.py +39 -0
  20. semantic_compressor-2.5/tests/test_tokenizer.py +70 -0
  21. semantic_compressor-2.3/compressor/resources/lid.176.ftz +0 -0
  22. semantic_compressor-2.3/compressor/semantic.py +0 -408
  23. {semantic_compressor-2.3 → semantic_compressor-2.5}/LICENSE +0 -0
  24. {semantic_compressor-2.3 → semantic_compressor-2.5}/MANIFEST.in +0 -0
  25. {semantic_compressor-2.3 → semantic_compressor-2.5}/README.md +0 -0
  26. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/__init__.py +0 -0
  27. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/minbpe/__init__.py +0 -0
  28. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/minbpe/base.py +0 -0
  29. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/minbpe/basic.py +0 -0
  30. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/minbpe/regex.py +0 -0
  31. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/__init__.py +0 -0
  32. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/en_stopwords.pkl +0 -0
  33. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/README +0 -0
  34. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/albanian +0 -0
  35. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/arabic +0 -0
  36. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/azerbaijani +0 -0
  37. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/basque +0 -0
  38. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/belarusian +0 -0
  39. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/bengali +0 -0
  40. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/catalan +0 -0
  41. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/chinese +0 -0
  42. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/danish +0 -0
  43. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/dutch +0 -0
  44. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/english +0 -0
  45. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/finnish +0 -0
  46. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/french +0 -0
  47. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/german +0 -0
  48. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/greek +0 -0
  49. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hebrew +0 -0
  50. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hinglish +0 -0
  51. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hungarian +0 -0
  52. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/indonesian +0 -0
  53. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/italian +0 -0
  54. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/kazakh +0 -0
  55. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/nepali +0 -0
  56. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/norwegian +0 -0
  57. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/portuguese +0 -0
  58. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/romanian +0 -0
  59. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/russian +0 -0
  60. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/slovene +0 -0
  61. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/spanish +0 -0
  62. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/swedish +0 -0
  63. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/tajik +0 -0
  64. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/tamil +0 -0
  65. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/turkish +0 -0
  66. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords.zip +0 -0
  67. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/wordnet.zip +0 -0
  68. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/README +0 -0
  69. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/en +0 -0
  70. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/en-basic +0 -0
  71. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step0.pt +0 -0
  72. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step1.pt +0 -0
  73. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step2.pt +0 -0
  74. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step3.pt +0 -0
  75. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step4.pt +0 -0
  76. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step5.pt +0 -0
  77. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step6.pt +0 -0
  78. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp.zip +0 -0
  79. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +0 -0
  80. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/.DS_Store +0 -0
  81. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/README +0 -0
  82. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/czech.pickle +0 -0
  83. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/danish.pickle +0 -0
  84. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/dutch.pickle +0 -0
  85. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/english.pickle +0 -0
  86. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/estonian.pickle +0 -0
  87. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/finnish.pickle +0 -0
  88. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/french.pickle +0 -0
  89. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/german.pickle +0 -0
  90. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/greek.pickle +0 -0
  91. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/italian.pickle +0 -0
  92. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/malayalam.pickle +0 -0
  93. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/norwegian.pickle +0 -0
  94. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/polish.pickle +0 -0
  95. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/portuguese.pickle +0 -0
  96. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/russian.pickle +0 -0
  97. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle +0 -0
  98. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle +0 -0
  99. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle +0 -0
  100. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle +0 -0
  101. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/README +0 -0
  102. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/czech.pickle +0 -0
  103. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/danish.pickle +0 -0
  104. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/dutch.pickle +0 -0
  105. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/english.pickle +0 -0
  106. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/estonian.pickle +0 -0
  107. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/finnish.pickle +0 -0
  108. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/french.pickle +0 -0
  109. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/german.pickle +0 -0
  110. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/greek.pickle +0 -0
  111. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/italian.pickle +0 -0
  112. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/malayalam.pickle +0 -0
  113. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/norwegian.pickle +0 -0
  114. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/polish.pickle +0 -0
  115. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/portuguese.pickle +0 -0
  116. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/russian.pickle +0 -0
  117. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/slovene.pickle +0 -0
  118. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/spanish.pickle +0 -0
  119. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/swedish.pickle +0 -0
  120. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/turkish.pickle +0 -0
  121. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt.zip +0 -0
  122. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/README +0 -0
  123. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +0 -0
  124. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +0 -0
  125. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
  126. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +0 -0
  127. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +0 -0
  128. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +0 -0
  129. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
  130. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +0 -0
  131. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +0 -0
  132. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +0 -0
  133. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
  134. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +0 -0
  135. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +0 -0
  136. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +0 -0
  137. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
  138. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +0 -0
  139. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +0 -0
  140. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +0 -0
  141. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
  142. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +0 -0
  143. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +0 -0
  144. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +0 -0
  145. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
  146. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +0 -0
  147. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +0 -0
  148. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +0 -0
  149. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
  150. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +0 -0
  151. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +0 -0
  152. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +0 -0
  153. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
  154. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +0 -0
  155. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +0 -0
  156. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +0 -0
  157. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +0 -0
  158. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +0 -0
  159. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +0 -0
  160. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +0 -0
  161. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +0 -0
  162. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +0 -0
  163. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +0 -0
  164. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +0 -0
  165. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +0 -0
  166. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +0 -0
  167. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +0 -0
  168. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +0 -0
  169. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +0 -0
  170. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +0 -0
  171. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +0 -0
  172. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +0 -0
  173. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +0 -0
  174. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +0 -0
  175. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +0 -0
  176. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +0 -0
  177. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +0 -0
  178. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +0 -0
  179. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +0 -0
  180. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
  181. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +0 -0
  182. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
  183. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +0 -0
  184. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +0 -0
  185. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +0 -0
  186. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +0 -0
  187. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +0 -0
  188. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +0 -0
  189. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +0 -0
  190. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +0 -0
  191. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +0 -0
  192. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +0 -0
  193. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +0 -0
  194. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +0 -0
  195. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +0 -0
  196. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +0 -0
  197. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +0 -0
  198. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +0 -0
  199. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
  200. {semantic_compressor-2.3 → semantic_compressor-2.5}/compressor/resources/pt_stopwords.pkl +0 -0
  201. {semantic_compressor-2.3 → semantic_compressor-2.5}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  202. {semantic_compressor-2.3 → semantic_compressor-2.5}/setup.cfg +0 -0
semantic_compressor-2.5/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: semantic_compressor
- Version: 2.3
+ Version: 2.5
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -9,13 +9,14 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.7
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: numpy<2
+ Requires-Dist: numpy
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
- Requires-Dist: fasttext
+ Requires-Dist: lingua-language-detector
  Requires-Dist: model2vec
  Requires-Dist: pyspellchecker
  Dynamic: author
+ Dynamic: license-file
  Dynamic: requires-python
 
  ```python
semantic_compressor-2.5/compressor/semantic.py (new file)
@@ -0,0 +1,414 @@
+ import os, sys, importlib.resources, functools, re
+
+ _NLTK_DATA_PATH = None
+ try:
+     _NLTK_DATA_PATH = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+ except Exception:
+     _NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), 'resources', 'nltk_data')
+
+ os.environ['NLTK_DATA'] = _NLTK_DATA_PATH
+
+ from collections import Counter
+
+ _PUNCT_REATTACH = re.compile(r'\s+([.!,\?;:])')
+ _PUNCT_BOUNDARY = re.compile(r'([.!,\?;:])(?=\S)')
+ _HYPHENATION = re.compile(r'(\w)-\s*\n\s*(\w)')
+ _NOISE_CHARS = re.compile(r'[\|\•\[\]\(\)\"“”]')
+ _LEADING_HYPHEN = re.compile(r'(?m)^\s*-\s*')
+ _STRAY_HYPHEN = re.compile(r'(?<!\w)-(?!\w)')
+ _REPEATED_PUNCT = re.compile(r'([!?.,;:]){2,}')
+ _MULTI_SPACE = re.compile(r'[ \t]+')
+ _MULTI_NEWLINE = re.compile(r'\n{2,}')
+ _AGGRESSIVE_CLEAN = re.compile(r'[^A-Za-zÀ-ÿ\s\.\,\;\:\?\!]')
+ _MULTI_SPACE2 = re.compile(r'\s{2,}')
+
+ _EN_STOPWORDS_PATH = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
+ _PT_STOPWORDS_PATH = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
+
+
+ @functools.lru_cache(maxsize=1)
+ def _ensure_nltk_ready():
+     import nltk.data
+     nltk.data.path.insert(0, _NLTK_DATA_PATH)
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_tokenizer():
+     from compressor.minbpe.regex import RegexTokenizer
+     return RegexTokenizer()
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_english_stemmer():
+     _ensure_nltk_ready()
+     from nltk.stem import PorterStemmer
+     return PorterStemmer()
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_portuguese_stemmer():
+     _ensure_nltk_ready()
+     from nltk.stem import RSLPStemmer
+     return RSLPStemmer()
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_language_detector():
+     from lingua import Language, LanguageDetectorBuilder
+     return LanguageDetectorBuilder.from_languages(
+         Language.ENGLISH, Language.PORTUGUESE
+     ).build()
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_language_enums():
+     from lingua import Language
+     return Language
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_english_stopwords():
+     import pickle
+     return pickle.load(open(_EN_STOPWORDS_PATH, "rb"))
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_portuguese_stopwords():
+     import pickle
+     return pickle.load(open(_PT_STOPWORDS_PATH, "rb"))
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_embedding_model():
+     from model2vec import StaticModel
+     return StaticModel.from_pretrained("cnmoro/static-nomic-eng-ptbr-tiny")
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_hashing_vectorizer():
+     from sklearn.feature_extraction.text import HashingVectorizer
+     return HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_sent_tokenize():
+     _ensure_nltk_ready()
+     from nltk.tokenize import sent_tokenize
+     return sent_tokenize
+
+
+ def clean_text(text: str) -> str:
+     text = _HYPHENATION.sub(r'\1\2', text)
+     text = _NOISE_CHARS.sub(' ', text)
+     text = _LEADING_HYPHEN.sub('', text)
+     text = _STRAY_HYPHEN.sub(' ', text)
+     text = _REPEATED_PUNCT.sub(r'\1', text)
+     text = _MULTI_SPACE.sub(' ', text)
+     text = _MULTI_NEWLINE.sub('\n', text).strip()
+
+     alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
+     if alpha_ratio < 0.8:
+         text = _AGGRESSIVE_CLEAN.sub(' ', text)
+         text = _MULTI_SPACE2.sub(' ', text).strip()
+
+     text = _PUNCT_REATTACH.sub(r'\1', text)
+     text = _PUNCT_BOUNDARY.sub(r'\1 ', text)
+     return text
+
+
+ def extract_textual_embeddings(text):
+     v = _get_hashing_vectorizer()
+     import numpy as np
+     return np.asarray(v.transform([text]).sum(axis=0)).ravel().tolist()
+
+
+ def extract_semantic_embeddings(text):
+     return _get_embedding_model().encode([text])[0]
+
+
+ def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
+     tok = _get_tokenizer()
+     chunks = []
+     current_chunk = []
+     current_chunk_length = 0
+     tokens = tok.encode(full_text)
+     for i, token in enumerate(tokens):
+         if current_chunk_length + 1 > tokens_per_chunk:
+             chunks.append(current_chunk)
+             current_chunk = tokens[i - chunk_overlap:i] if i > chunk_overlap else []
+             current_chunk_length = len(current_chunk)
+         current_chunk.append(token)
+         current_chunk_length += 1
+     chunks.append(current_chunk)
+     return [tok.decode(chunk) for chunk in chunks]
+
+
+ def count_tokens(text):
+     return len(_get_tokenizer().encode(text))
+
+
+ def detect_language(text):
+     Language = _get_language_enums()
+     lang = _get_language_detector().detect_language_of(text)
+     return 'pt' if lang == Language.PORTUGUESE else 'en'
+
+
+ def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+     words = text.split()
+     n = len(words)
+     if n < ngram_size:
+         return text
+
+     ngram_tuples = [tuple(words[i:i + ngram_size]) for i in range(n - ngram_size + 1)]
+     counter = Counter(ngram_tuples)
+     repeated = [ng for ng, count in counter.items() if count > threshold]
+     if not repeated:
+         return text
+
+     for ng in repeated:
+         first = True
+         i = 0
+         while i <= len(words) - ngram_size:
+             if tuple(words[i:i + ngram_size]) == ng:
+                 if first:
+                     first = False
+                     i += ngram_size
+                 else:
+                     del words[i:i + ngram_size]
+             else:
+                 i += 1
+     return ' '.join(words)
+
+
+ def calculate_similarity(embed1, embed2):
+     from sklearn.metrics.pairwise import cosine_similarity
+     return cosine_similarity([embed1], [embed2])[0][0]
+
+
+ def _get_stopwords(lang):
+     if lang == 'pt':
+         return _get_portuguese_stopwords()
+     return _get_english_stopwords()
+
+
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
+     import warnings
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.decomposition import TruncatedSVD
+     import numpy as np
+     import traceback
+
+     try:
+         if perform_cleaning:
+             full_text = clean_text(full_text)
+
+         sent_tokenize = _get_sent_tokenize()
+         sentences = sent_tokenize(full_text)
+
+         final_sentences = []
+         for s in sentences:
+             final_sentences.extend(s.split('\n'))
+         sentences = final_sentences
+         n_sentences = len(sentences)
+
+         text_lang = detect_language(full_text)
+         stopwords = _get_stopwords(text_lang)
+
+         if n_sentences >= 3:
+             n_topics = min(num_topics, max(2, n_sentences // 5))
+             max_features = min(3000, max(500, n_sentences * 10))
+
+             vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=max_features)
+             doc_term_matrix = vectorizer.fit_transform(sentences)
+             svd = TruncatedSVD(n_components=n_topics, random_state=42)
+             with warnings.catch_warnings():
+                 warnings.filterwarnings('ignore', category=RuntimeWarning, message='.*divide by zero.*')
+                 svd.fit(doc_term_matrix)
+             topic_scores = np.abs(svd.transform(vectorizer.transform(sentences)))
+         else:
+             topic_scores = np.ones((n_sentences, 1)) * 0.5
+
+         doc_embedding = extract_semantic_embeddings(full_text)
+
+         if reference_text is not None:
+             reference_text_embedding = extract_semantic_embeddings(reference_text)
+             doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
+
+         sentence_embeddings = _get_embedding_model().encode(sentences)
+
+         sentence_scores = []
+         for i, sentence in enumerate(sentences):
+             sentence_embedding = sentence_embeddings[i]
+             semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+
+             topic_importance = float(np.max(topic_scores[i]))
+
+             words = sentence.split()
+             unique_words = set(w.lower() for w in words if w.lower() not in stopwords)
+             lexical_diversity = len(unique_words) / len(words) if words else 0
+
+             importance = 0.6 * semantic_similarity + 0.3 * topic_importance + 0.2 * lexical_diversity
+             sentence_scores.append((sentence, importance))
+
+         sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
+
+         total_words = sum(len(s.split()) for s in sentences)
+         target_words = int(total_words * compression_rate)
+
+         compressed_text = []
+         current_words = 0
+         for sentence, _ in sorted_sentences:
+             sentence_words = len(sentence.split())
+             if current_words + sentence_words <= target_words:
+                 compressed_text.append(sentence)
+                 current_words += sentence_words
+             else:
+                 break
+
+         if not compressed_text:
+             compressed_text = [sentences[0]]
+
+         compressed_text.sort(key=lambda x: sentences.index(x))
+         compressed_text = [s.capitalize() for s in compressed_text]
+
+         cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+         cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+         return cleaned_compressed_text
+     except Exception:
+         traceback.print_exc()
+         return full_text
+
+
+ def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
+     import traceback
+     try:
+         if target_token_count is None:
+             compression_rate = 1 - compression_rate
+         else:
+             original_token_count = count_tokens(text)
+             if original_token_count <= target_token_count:
+                 return text
+             compression_rate = target_token_count / original_token_count
+
+         return semantic_compress_text(
+             full_text=text,
+             compression_rate=compression_rate,
+             reference_text=reference_text_steering,
+             perform_cleaning=perform_cleaning
+         )
+     except Exception:
+         traceback.print_exc()
+         return text
+
+
+ def stem_text(text, lang='en'):
+     if lang == 'en':
+         stemmer = _get_english_stemmer()
+     else:
+         stemmer = _get_portuguese_stemmer()
+     return ' '.join(stemmer.stem(word) for word in text.split())
+
+
+ def correct_spelling(sentence, detected_lang="pt"):
+     from spellchecker import SpellChecker
+     spell = SpellChecker(language=detected_lang)
+     words = sentence.split()
+     fixed = [spell.correction(word) for word in words]
+     result = []
+     for original, fixed_word in zip(words, fixed):
+         result.append(fixed_word if fixed_word is not None else original)
+     return ' '.join(result)
+
+
+ def preprocess_and_extract_textual_embedding(block, use_stemming, lang):
+     processed_block = block.lower() if not use_stemming else stem_text(block.lower(), lang)
+     return extract_textual_embeddings(processed_block)
+
+
+ def find_needle_in_haystack(
+     *, haystack: str, needle: str, block_size=300,
+     embedding_mode: str = 'both',
+     semantic_embeddings_weight: float = 0.3,
+     textual_embeddings_weight: float = 0.7,
+     use_stemming: bool = False,
+     correct_spelling_needle: bool = False
+ ):
+     import traceback
+     try:
+         if embedding_mode not in {'semantic', 'textual', 'both'}:
+             raise ValueError("Invalid embedding_mode. Choose 'semantic', 'textual', or 'both'.")
+
+         blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+         lang = detect_language(f"{needle}\n\n{haystack}")
+
+         if correct_spelling_needle:
+             needle = correct_spelling(needle, lang)
+
+         needle_semantic_embedding = None
+         needle_textual_embedding = None
+
+         if embedding_mode in {'semantic', 'both'}:
+             needle_semantic_embedding = extract_semantic_embeddings(needle)
+
+         if embedding_mode in {'textual', 'both'}:
+             needle_textual_embedding = extract_textual_embeddings(
+                 needle.lower() if not use_stemming else stem_text(needle, lang)
+             )
+
+         haystack_semantic_embeddings = []
+         haystack_textual_embeddings = []
+
+         if embedding_mode in {'semantic', 'both'}:
+             if len(blocks) == 1:
+                 haystack_semantic_embeddings = [extract_semantic_embeddings(blocks[0])]
+             else:
+                 from concurrent.futures import ProcessPoolExecutor
+                 with ProcessPoolExecutor() as executor:
+                     haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
+
+         if embedding_mode in {'textual', 'both'}:
+             if len(blocks) == 1:
+                 haystack_textual_embeddings = [preprocess_and_extract_textual_embedding(blocks[0], use_stemming, lang)]
+             else:
+                 from concurrent.futures import ProcessPoolExecutor
+                 from multiprocessing import cpu_count
+                 with ProcessPoolExecutor(max_workers=int(cpu_count() // 1.5)) as executor:
+                     haystack_textual_embeddings = list(
+                         executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming] * len(blocks), [lang] * len(blocks))
+                     )
+
+         semantic_similarities = []
+         textual_similarities = []
+
+         if embedding_mode in {'semantic', 'both'}:
+             semantic_similarities = [
+                 calculate_similarity(needle_semantic_embedding, be)
+                 for be in haystack_semantic_embeddings
+             ]
+
+         if embedding_mode in {'textual', 'both'}:
+             textual_similarities = [
+                 calculate_similarity(needle_textual_embedding, be)
+                 for be in haystack_textual_embeddings
+             ]
+
+         if embedding_mode == 'semantic':
+             sorted_blocks = sorted(zip(blocks, semantic_similarities), key=lambda x: x[1], reverse=True)
+         elif embedding_mode == 'textual':
+             sorted_blocks = sorted(zip(blocks, textual_similarities), key=lambda x: x[1], reverse=True)
+         else:
+             sorted_blocks = sorted(
+                 zip(blocks, semantic_similarities, textual_similarities),
+                 key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight,
+                 reverse=True
+             )
+
+         most_similar_block = sorted_blocks[0][0]
+         most_similar_block_index = blocks.index(most_similar_block)
+         start_index = most_similar_block_index - 1 if most_similar_block_index > 0 else 0
+         needle_region = blocks[start_index:most_similar_block_index + 2]
+         return ''.join(needle_region).strip()
+     except Exception:
+         traceback.print_exc()
+         return haystack
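
Taken together, the new compressor/semantic.py exposes a small public surface: compress_text, semantic_compress_text, find_needle_in_haystack, detect_language, and a few helpers. A minimal usage sketch against that surface; it assumes the cnmoro/static-nomic-eng-ptbr-tiny model can be downloaded on first use, and article.txt is a hypothetical input file, not part of the package:

```python
from compressor.semantic import compress_text, find_needle_in_haystack, detect_language

# Hypothetical input document (not shipped with the package).
article = open("article.txt", encoding="utf-8").read()

# compression_rate here is the fraction to *remove*: compress_text passes
# 1 - compression_rate to semantic_compress_text, which keeps that share of words.
shorter = compress_text(article, compression_rate=0.3)   # keep roughly 70% of the words

# Or target an absolute token budget, measured with the bundled minbpe tokenizer.
capped = compress_text(article, target_token_count=512)

# Rank ~300-token blocks by blended semantic + char-ngram similarity and
# return the best block together with its immediate neighbors.
region = find_needle_in_haystack(haystack=article, needle="when was the company founded?")

print(detect_language(article), len(shorter), len(capped), len(region))
```

Note that every public entry point wraps its body in try/except and falls back to returning its input, so failures degrade to a no-op rather than raising.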
semantic_compressor-2.5/pyproject.toml
@@ -1,10 +1,10 @@
  [build-system]
- requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
+ requires = ["setuptools>=61.0"]
  build-backend = "setuptools.build_meta"
 
  [project]
  name = "semantic_compressor"
- version = "2.3"
+ version = "2.5"
  authors = [
      { name="Carlo Moro", email="cnmoro@gmail.com" },
  ]
@@ -17,10 +17,10 @@ classifiers = [
      "Operating System :: OS Independent",
  ]
  dependencies = [
-     "numpy<2",
+     "numpy",
      "nltk",
      "scikit-learn",
-     "fasttext",
+     "lingua-language-detector",
      "model2vec",
      "pyspellchecker"
  ]
semantic_compressor-2.5/semantic_compressor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: semantic_compressor
- Version: 2.3
+ Version: 2.5
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -9,13 +9,14 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.7
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: numpy<2
+ Requires-Dist: numpy
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
- Requires-Dist: fasttext
+ Requires-Dist: lingua-language-detector
  Requires-Dist: model2vec
  Requires-Dist: pyspellchecker
  Dynamic: author
+ Dynamic: license-file
  Dynamic: requires-python
 
  ```python
semantic_compressor-2.5/semantic_compressor.egg-info/SOURCES.txt
@@ -11,7 +11,6 @@ compressor/minbpe/basic.py
  compressor/minbpe/regex.py
  compressor/resources/__init__.py
  compressor/resources/en_stopwords.pkl
- compressor/resources/lid.176.ftz
  compressor/resources/pt_stopwords.pkl
  compressor/resources/nltk_data/corpora/stopwords.zip
  compressor/resources/nltk_data/corpora/wordnet.zip
@@ -184,4 +183,16 @@ semantic_compressor.egg-info/PKG-INFO
  semantic_compressor.egg-info/SOURCES.txt
  semantic_compressor.egg-info/dependency_links.txt
  semantic_compressor.egg-info/requires.txt
- semantic_compressor.egg-info/top_level.txt
+ semantic_compressor.egg-info/top_level.txt
+ tests/__init__.py
+ tests/conftest.py
+ tests/test_benchmark.py
+ tests/test_clean_text.py
+ tests/test_compression.py
+ tests/test_correct_spelling.py
+ tests/test_embeddings.py
+ tests/test_find_needle_in_haystack.py
+ tests/test_language_and_stemming.py
+ tests/test_ngrams.py
+ tests/test_semantic_embeddings.py
+ tests/test_tokenizer.py
semantic_compressor-2.5/semantic_compressor.egg-info/requires.txt
@@ -1,6 +1,6 @@
- numpy<2
+ numpy
  nltk
  scikit-learn
- fasttext
+ lingua-language-detector
  model2vec
  pyspellchecker
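
The fasttext → lingua-language-detector swap (together with the removal of the bundled lid.176.ftz model in the SOURCES.txt hunk above) means language identification no longer depends on a binary model file. A standalone sketch of the detector that _get_language_detector now builds, using exactly the lingua calls visible in the semantic.py diff:

```python
from lingua import Language, LanguageDetectorBuilder

# The same two-language detector that compressor/semantic.py caches internally.
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.PORTUGUESE
).build()

lang = detector.detect_language_of("O rato roeu a roupa do rei de Roma.")
print('pt' if lang == Language.PORTUGUESE else 'en')  # -> pt
```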
semantic_compressor-2.5/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
  setup(
      name='semantic_compressor',
-     version='2.3',
+     version='2.5',
      author='Carlo Moro',
      author_email='cnmoro@gmail.com',
      description="Semantic text compression",
@@ -12,10 +12,10 @@ setup(
      },
      include_package_data=True,
      install_requires=[
-         "numpy<2",
+         "numpy",
          "nltk",
          "scikit-learn",
-         "fasttext",
+         "lingua-language-detector",
          "model2vec",
          "pyspellchecker"
      ],
semantic_compressor-2.5/tests/__init__.py: file without changes (new empty file)
semantic_compressor-2.5/tests/conftest.py (new file)
@@ -0,0 +1,92 @@
+ import os
+ import sys
+ import pytest
+ import importlib.resources
+
+ _NLTK_DATA_PATH = None
+ try:
+     _NLTK_DATA_PATH = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+ except Exception:
+     _NLTK_DATA_PATH = os.path.join(
+         os.path.dirname(__file__), '..', 'compressor', 'resources', 'nltk_data'
+     )
+
+ os.environ['NLTK_DATA'] = _NLTK_DATA_PATH
+
+ from compressor.semantic import (
+     clean_text,
+     detect_language,
+     stem_text,
+     count_tokens,
+     structurize_text,
+     extract_textual_embeddings,
+     calculate_similarity,
+     compute_and_remove_repeated_ngrams,
+     correct_spelling,
+ )
+
+
+ def pytest_configure(config):
+     config.addinivalue_line(
+         "markers",
+         "need_model: mark test as requiring the model2vec model (skipped if not available)",
+     )
+
+
+ def _check_model_available():
+     try:
+         from compressor.semantic import _get_embedding_model
+         _get_embedding_model()
+         return True
+     except Exception:
+         return False
+
+
+ def pytest_collection_modifyitems(config, items):
+     model_available = _check_model_available()
+     if not model_available:
+         skip_need_model = pytest.mark.skip(reason="model2vec model not available")
+         for item in items:
+             if "need_model" in item.keywords:
+                 item.add_marker(skip_need_model)
+
+
+ @pytest.fixture
+ def sample_text_en():
+     return (
+         "The quick brown fox jumps over the lazy dog. "
+         "This is a test sentence for the semantic compressor. "
+         "Natural language processing is a fascinating field. "
+         "Machine learning algorithms can analyze text data efficiently."
+     )
+
+
+ @pytest.fixture
+ def sample_text_pt():
+     return (
+         "O rato roeu a roupa do rei de Roma. "
+         "Esta é uma frase de teste para o compressor semântico. "
+         "Processamento de linguagem natural é uma área fascinante. "
+         "Algoritmos de aprendizado de máquina podem analisar texto eficientemente."
+     )
+
+
+ @pytest.fixture
+ def sample_text_noisy():
+     return "Hello, World!!! This is... a very noisy??? text---with | weird • characters [and] (parens)."
+
+
+ @pytest.fixture
+ def sample_text_hyphenated():
+     return "This is a hyphen- ated word that should be re- paired.\n\nSecond paragraph here."
+
+
+ @pytest.fixture
+ def sample_text_dense():
+     return (
+         "Artificial intelligence has transformed the modern world. "
+         "Deep learning models can recognize patterns in complex data. "
+         "Neural networks are inspired by the human brain. "
+         "Natural language understanding enables machines to read text. "
+         "Computer vision allows machines to interpret images and video."
+     )