semantic-compressor 2.4.tar.gz → 2.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_compressor-2.4/semantic_compressor.egg-info → semantic_compressor-2.5}/PKG-INFO +5 -4
- semantic_compressor-2.5/compressor/semantic.py +414 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/pyproject.toml +4 -4
- {semantic_compressor-2.4 → semantic_compressor-2.5/semantic_compressor.egg-info}/PKG-INFO +5 -4
- {semantic_compressor-2.4 → semantic_compressor-2.5}/semantic_compressor.egg-info/SOURCES.txt +13 -2
- {semantic_compressor-2.4 → semantic_compressor-2.5}/semantic_compressor.egg-info/requires.txt +2 -2
- {semantic_compressor-2.4 → semantic_compressor-2.5}/semantic_compressor.egg-info/top_level.txt +1 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/setup.py +3 -3
- semantic_compressor-2.5/tests/__init__.py +0 -0
- semantic_compressor-2.5/tests/conftest.py +92 -0
- semantic_compressor-2.5/tests/test_benchmark.py +187 -0
- semantic_compressor-2.5/tests/test_clean_text.py +90 -0
- semantic_compressor-2.5/tests/test_compression.py +123 -0
- semantic_compressor-2.5/tests/test_correct_spelling.py +30 -0
- semantic_compressor-2.5/tests/test_embeddings.py +74 -0
- semantic_compressor-2.5/tests/test_find_needle_in_haystack.py +147 -0
- semantic_compressor-2.5/tests/test_language_and_stemming.py +74 -0
- semantic_compressor-2.5/tests/test_ngrams.py +60 -0
- semantic_compressor-2.5/tests/test_semantic_embeddings.py +39 -0
- semantic_compressor-2.5/tests/test_tokenizer.py +70 -0
- semantic_compressor-2.4/compressor/resources/lid.176.ftz +0 -0
- semantic_compressor-2.4/compressor/semantic.py +0 -408
- {semantic_compressor-2.4 → semantic_compressor-2.5}/LICENSE +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/MANIFEST.in +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/README.md +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/__init__.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/minbpe/__init__.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/minbpe/base.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/minbpe/basic.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/minbpe/regex.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/__init__.py +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/en_stopwords.pkl +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/README +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/albanian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/arabic +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/azerbaijani +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/basque +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/belarusian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/bengali +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/catalan +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/chinese +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/danish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/dutch +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/english +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/finnish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/french +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/german +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/greek +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hebrew +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hinglish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/hungarian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/indonesian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/italian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/kazakh +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/nepali +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/norwegian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/portuguese +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/romanian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/russian +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/slovene +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/spanish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/swedish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/tajik +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/tamil +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords/turkish +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/stopwords.zip +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/wordnet.zip +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/README +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/en +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/corpora/words/en-basic +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step0.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step1.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step2.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step3.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step4.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step5.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp/step6.pt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/stemmers/rslp.zip +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/.DS_Store +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/README +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/czech.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/danish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/dutch.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/english.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/estonian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/finnish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/french.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/german.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/greek.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/italian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/malayalam.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/norwegian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/polish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/portuguese.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/russian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/README +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/czech.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/danish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/dutch.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/english.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/estonian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/finnish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/french.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/german.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/greek.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/italian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/malayalam.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/norwegian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/polish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/portuguese.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/russian.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/slovene.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/spanish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/swedish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt/turkish.pickle +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt.zip +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/README +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/compressor/resources/pt_stopwords.pkl +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/semantic_compressor.egg-info/dependency_links.txt +0 -0
- {semantic_compressor-2.4 → semantic_compressor-2.5}/setup.cfg +0 -0
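
The substantive changes in this release are easy to read off the file list: the bundled fastText language-ID model (compressor/resources/lid.176.ftz) is removed, compressor/semantic.py is rewritten wholesale (+414/-0 against the deleted 2.4 module), the lingua-language-detector dependency replaces the old language-ID stack, and a pytest suite is added. Below is a minimal sketch of the detection swap; the lingua half mirrors `_get_language_detector`/`detect_language` in the new semantic.py shown further down, while the fastText half is an assumption based on typical lid.176 usage, since the 2.4 code does not appear in this diff:

```python
# 2.4 (assumed, not shown in this diff): fastText lid.176 language ID
# import fasttext
# model = fasttext.load_model("lid.176.ftz")
# labels, probs = model.predict("O rato roeu a roupa do rei de Roma.")

# 2.5: lingua, restricted to the two languages the package supports
from lingua import Language, LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.PORTUGUESE
).build()

lang = detector.detect_language_of("O rato roeu a roupa do rei de Roma.")
print('pt' if lang == Language.PORTUGUESE else 'en')  # -> 'pt'
```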
{semantic_compressor-2.4/semantic_compressor.egg-info → semantic_compressor-2.5}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: semantic_compressor
-Version: 2.4
+Version: 2.5
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -9,13 +9,14 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy
+Requires-Dist: numpy
 Requires-Dist: nltk
 Requires-Dist: scikit-learn
-Requires-Dist:
+Requires-Dist: lingua-language-detector
 Requires-Dist: model2vec
 Requires-Dist: pyspellchecker
 Dynamic: author
+Dynamic: license-file
 Dynamic: requires-python
 
 ```python
semantic_compressor-2.5/compressor/semantic.py
ADDED
@@ -0,0 +1,414 @@
+import os, sys, importlib.resources, functools, re
+
+_NLTK_DATA_PATH = None
+try:
+    _NLTK_DATA_PATH = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+except Exception:
+    _NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), 'resources', 'nltk_data')
+
+os.environ['NLTK_DATA'] = _NLTK_DATA_PATH
+
+from collections import Counter
+
+_PUNCT_REATTACH = re.compile(r'\s+([.!,\?;:])')
+_PUNCT_BOUNDARY = re.compile(r'([.!,\?;:])(?=\S)')
+_HYPHENATION = re.compile(r'(\w)-\s*\n\s*(\w)')
+_NOISE_CHARS = re.compile(r'[\|\•\[\]\(\)\"“”]')
+_LEADING_HYPHEN = re.compile(r'(?m)^\s*-\s*')
+_STRAY_HYPHEN = re.compile(r'(?<!\w)-(?!\w)')
+_REPEATED_PUNCT = re.compile(r'([!?.,;:]){2,}')
+_MULTI_SPACE = re.compile(r'[ \t]+')
+_MULTI_NEWLINE = re.compile(r'\n{2,}')
+_AGGRESSIVE_CLEAN = re.compile(r'[^A-Za-zÀ-ÿ\s\.\,\;\:\?\!]')
+_MULTI_SPACE2 = re.compile(r'\s{2,}')
+
+_EN_STOPWORDS_PATH = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
+_PT_STOPWORDS_PATH = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
+
+
+@functools.lru_cache(maxsize=1)
+def _ensure_nltk_ready():
+    import nltk.data
+    nltk.data.path.insert(0, _NLTK_DATA_PATH)
+
+
+@functools.lru_cache(maxsize=1)
+def _get_tokenizer():
+    from compressor.minbpe.regex import RegexTokenizer
+    return RegexTokenizer()
+
+
+@functools.lru_cache(maxsize=1)
+def _get_english_stemmer():
+    _ensure_nltk_ready()
+    from nltk.stem import PorterStemmer
+    return PorterStemmer()
+
+
+@functools.lru_cache(maxsize=1)
+def _get_portuguese_stemmer():
+    _ensure_nltk_ready()
+    from nltk.stem import RSLPStemmer
+    return RSLPStemmer()
+
+
+@functools.lru_cache(maxsize=1)
+def _get_language_detector():
+    from lingua import Language, LanguageDetectorBuilder
+    return LanguageDetectorBuilder.from_languages(
+        Language.ENGLISH, Language.PORTUGUESE
+    ).build()
+
+
+@functools.lru_cache(maxsize=1)
+def _get_language_enums():
+    from lingua import Language
+    return Language
+
+
+@functools.lru_cache(maxsize=1)
+def _get_english_stopwords():
+    import pickle
+    return pickle.load(open(_EN_STOPWORDS_PATH, "rb"))
+
+
+@functools.lru_cache(maxsize=1)
+def _get_portuguese_stopwords():
+    import pickle
+    return pickle.load(open(_PT_STOPWORDS_PATH, "rb"))
+
+
+@functools.lru_cache(maxsize=1)
+def _get_embedding_model():
+    from model2vec import StaticModel
+    return StaticModel.from_pretrained("cnmoro/static-nomic-eng-ptbr-tiny")
+
+
+@functools.lru_cache(maxsize=1)
+def _get_hashing_vectorizer():
+    from sklearn.feature_extraction.text import HashingVectorizer
+    return HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+
+@functools.lru_cache(maxsize=1)
+def _get_sent_tokenize():
+    _ensure_nltk_ready()
+    from nltk.tokenize import sent_tokenize
+    return sent_tokenize
+
+
+def clean_text(text: str) -> str:
+    text = _HYPHENATION.sub(r'\1\2', text)
+    text = _NOISE_CHARS.sub(' ', text)
+    text = _LEADING_HYPHEN.sub('', text)
+    text = _STRAY_HYPHEN.sub(' ', text)
+    text = _REPEATED_PUNCT.sub(r'\1', text)
+    text = _MULTI_SPACE.sub(' ', text)
+    text = _MULTI_NEWLINE.sub('\n', text).strip()
+
+    alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
+    if alpha_ratio < 0.8:
+        text = _AGGRESSIVE_CLEAN.sub(' ', text)
+        text = _MULTI_SPACE2.sub(' ', text).strip()
+
+    text = _PUNCT_REATTACH.sub(r'\1', text)
+    text = _PUNCT_BOUNDARY.sub(r'\1 ', text)
+    return text
+
+
+def extract_textual_embeddings(text):
+    v = _get_hashing_vectorizer()
+    import numpy as np
+    return np.asarray(v.transform([text]).sum(axis=0)).ravel().tolist()
+
+
+def extract_semantic_embeddings(text):
+    return _get_embedding_model().encode([text])[0]
+
+
+def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
+    tok = _get_tokenizer()
+    chunks = []
+    current_chunk = []
+    current_chunk_length = 0
+    tokens = tok.encode(full_text)
+    for i, token in enumerate(tokens):
+        if current_chunk_length + 1 > tokens_per_chunk:
+            chunks.append(current_chunk)
+            current_chunk = tokens[i - chunk_overlap:i] if i > chunk_overlap else []
+            current_chunk_length = len(current_chunk)
+        current_chunk.append(token)
+        current_chunk_length += 1
+    chunks.append(current_chunk)
+    return [tok.decode(chunk) for chunk in chunks]
+
+
+def count_tokens(text):
+    return len(_get_tokenizer().encode(text))
+
+
+def detect_language(text):
+    Language = _get_language_enums()
+    lang = _get_language_detector().detect_language_of(text)
+    return 'pt' if lang == Language.PORTUGUESE else 'en'
+
+
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+    n = len(words)
+    if n < ngram_size:
+        return text
+
+    ngram_tuples = [tuple(words[i:i + ngram_size]) for i in range(n - ngram_size + 1)]
+    counter = Counter(ngram_tuples)
+    repeated = [ng for ng, count in counter.items() if count > threshold]
+    if not repeated:
+        return text
+
+    for ng in repeated:
+        first = True
+        i = 0
+        while i <= len(words) - ngram_size:
+            if tuple(words[i:i + ngram_size]) == ng:
+                if first:
+                    first = False
+                    i += ngram_size
+                else:
+                    del words[i:i + ngram_size]
+            else:
+                i += 1
+    return ' '.join(words)
+
+
+def calculate_similarity(embed1, embed2):
+    from sklearn.metrics.pairwise import cosine_similarity
+    return cosine_similarity([embed1], [embed2])[0][0]
+
+
+def _get_stopwords(lang):
+    if lang == 'pt':
+        return _get_portuguese_stopwords()
+    return _get_english_stopwords()
+
+
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
+    import warnings
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.decomposition import TruncatedSVD
+    import numpy as np
+    import traceback
+
+    try:
+        if perform_cleaning:
+            full_text = clean_text(full_text)
+
+        sent_tokenize = _get_sent_tokenize()
+        sentences = sent_tokenize(full_text)
+
+        final_sentences = []
+        for s in sentences:
+            final_sentences.extend(s.split('\n'))
+        sentences = final_sentences
+        n_sentences = len(sentences)
+
+        text_lang = detect_language(full_text)
+        stopwords = _get_stopwords(text_lang)
+
+        if n_sentences >= 3:
+            n_topics = min(num_topics, max(2, n_sentences // 5))
+            max_features = min(3000, max(500, n_sentences * 10))
+
+            vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=max_features)
+            doc_term_matrix = vectorizer.fit_transform(sentences)
+            svd = TruncatedSVD(n_components=n_topics, random_state=42)
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', category=RuntimeWarning, message='.*divide by zero.*')
+                svd.fit(doc_term_matrix)
+            topic_scores = np.abs(svd.transform(vectorizer.transform(sentences)))
+        else:
+            topic_scores = np.ones((n_sentences, 1)) * 0.5
+
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
+
+        sentence_embeddings = _get_embedding_model().encode(sentences)
+
+        sentence_scores = []
+        for i, sentence in enumerate(sentences):
+            sentence_embedding = sentence_embeddings[i]
+            semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+
+            topic_importance = float(np.max(topic_scores[i]))
+
+            words = sentence.split()
+            unique_words = set(w.lower() for w in words if w.lower() not in stopwords)
+            lexical_diversity = len(unique_words) / len(words) if words else 0
+
+            importance = 0.6 * semantic_similarity + 0.3 * topic_importance + 0.2 * lexical_diversity
+            sentence_scores.append((sentence, importance))
+
+        sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
+
+        total_words = sum(len(s.split()) for s in sentences)
+        target_words = int(total_words * compression_rate)
+
+        compressed_text = []
+        current_words = 0
+        for sentence, _ in sorted_sentences:
+            sentence_words = len(sentence.split())
+            if current_words + sentence_words <= target_words:
+                compressed_text.append(sentence)
+                current_words += sentence_words
+            else:
+                break
+
+        if not compressed_text:
+            compressed_text = [sentences[0]]
+
+        compressed_text.sort(key=lambda x: sentences.index(x))
+        compressed_text = [s.capitalize() for s in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
+    except Exception:
+        traceback.print_exc()
+        return full_text
+
+
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
+    import traceback
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text=text,
+            compression_rate=compression_rate,
+            reference_text=reference_text_steering,
+            perform_cleaning=perform_cleaning
+        )
+    except Exception:
+        traceback.print_exc()
+        return text
+
+
+def stem_text(text, lang='en'):
+    if lang == 'en':
+        stemmer = _get_english_stemmer()
+    else:
+        stemmer = _get_portuguese_stemmer()
+    return ' '.join(stemmer.stem(word) for word in text.split())
+
+
+def correct_spelling(sentence, detected_lang="pt"):
+    from spellchecker import SpellChecker
+    spell = SpellChecker(language=detected_lang)
+    words = sentence.split()
+    fixed = [spell.correction(word) for word in words]
+    result = []
+    for original, fixed_word in zip(words, fixed):
+        result.append(fixed_word if fixed_word is not None else original)
+    return ' '.join(result)
+
+
+def preprocess_and_extract_textual_embedding(block, use_stemming, lang):
+    processed_block = block.lower() if not use_stemming else stem_text(block.lower(), lang)
+    return extract_textual_embeddings(processed_block)
+
+
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size=300,
+    embedding_mode: str = 'both',
+    semantic_embeddings_weight: float = 0.3,
+    textual_embeddings_weight: float = 0.7,
+    use_stemming: bool = False,
+    correct_spelling_needle: bool = False
+):
+    import traceback
+    try:
+        if embedding_mode not in {'semantic', 'textual', 'both'}:
+            raise ValueError("Invalid embedding_mode. Choose 'semantic', 'textual', or 'both'.")
+
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        lang = detect_language(f"{needle}\n\n{haystack}")
+
+        if correct_spelling_needle:
+            needle = correct_spelling(needle, lang)
+
+        needle_semantic_embedding = None
+        needle_textual_embedding = None
+
+        if embedding_mode in {'semantic', 'both'}:
+            needle_semantic_embedding = extract_semantic_embeddings(needle)
+
+        if embedding_mode in {'textual', 'both'}:
+            needle_textual_embedding = extract_textual_embeddings(
+                needle.lower() if not use_stemming else stem_text(needle, lang)
+            )
+
+        haystack_semantic_embeddings = []
+        haystack_textual_embeddings = []
+
+        if embedding_mode in {'semantic', 'both'}:
+            if len(blocks) == 1:
+                haystack_semantic_embeddings = [extract_semantic_embeddings(blocks[0])]
+            else:
+                from concurrent.futures import ProcessPoolExecutor
+                with ProcessPoolExecutor() as executor:
+                    haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
+
+        if embedding_mode in {'textual', 'both'}:
+            if len(blocks) == 1:
+                haystack_textual_embeddings = [preprocess_and_extract_textual_embedding(blocks[0], use_stemming, lang)]
+            else:
+                from concurrent.futures import ProcessPoolExecutor
+                from multiprocessing import cpu_count
+                with ProcessPoolExecutor(max_workers=int(cpu_count() // 1.5)) as executor:
+                    haystack_textual_embeddings = list(
+                        executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming] * len(blocks), [lang] * len(blocks))
+                    )
+
+        semantic_similarities = []
+        textual_similarities = []
+
+        if embedding_mode in {'semantic', 'both'}:
+            semantic_similarities = [
+                calculate_similarity(needle_semantic_embedding, be)
+                for be in haystack_semantic_embeddings
+            ]
+
+        if embedding_mode in {'textual', 'both'}:
+            textual_similarities = [
+                calculate_similarity(needle_textual_embedding, be)
+                for be in haystack_textual_embeddings
+            ]
+
+        if embedding_mode == 'semantic':
+            sorted_blocks = sorted(zip(blocks, semantic_similarities), key=lambda x: x[1], reverse=True)
+        elif embedding_mode == 'textual':
+            sorted_blocks = sorted(zip(blocks, textual_similarities), key=lambda x: x[1], reverse=True)
+        else:
+            sorted_blocks = sorted(
+                zip(blocks, semantic_similarities, textual_similarities),
+                key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight,
+                reverse=True
+            )
+
+        most_similar_block = sorted_blocks[0][0]
+        most_similar_block_index = blocks.index(most_similar_block)
+        start_index = most_similar_block_index - 1 if most_similar_block_index > 0 else 0
+        needle_region = blocks[start_index:most_similar_block_index + 2]
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+        return haystack
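
The rewritten module keeps lazy `lru_cache` accessors so heavy imports (nltk, model2vec, scikit-learn, lingua) only happen on first use. A minimal usage sketch of the public API, based on the signatures above; the sample text is illustrative, and the first call downloads the `cnmoro/static-nomic-eng-ptbr-tiny` model, so it needs network access:

```python
from compressor.semantic import (
    compress_text, find_needle_in_haystack, count_tokens, detect_language
)

doc = (
    "Artificial intelligence has transformed the modern world. "
    "Deep learning models can recognize patterns in complex data. "
    "Neural networks are inspired by the human brain. "
) * 10

print(detect_language(doc))   # 'en' or 'pt'; only these two languages are supported
print(count_tokens(doc))      # length under the bundled minbpe RegexTokenizer

# compression_rate is the fraction to *remove*: 0.7 keeps roughly 30% of the words
# (compress_text passes 1 - compression_rate down to semantic_compress_text).
shorter = compress_text(doc, compression_rate=0.7)

# Alternatively, aim for an absolute token budget.
budgeted = compress_text(doc, target_token_count=100)

# Rank ~300-token blocks against a query and return the best block plus its neighbors.
region = find_needle_in_haystack(
    haystack=doc,
    needle="How do neural networks relate to the brain?",
    embedding_mode='both',  # 'semantic', 'textual', or 'both'
)
```

Note that both entry points swallow exceptions and fall back to returning the input text unchanged, so failures surface only as a printed traceback.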
{semantic_compressor-2.4 → semantic_compressor-2.5}/pyproject.toml
RENAMED
@@ -1,10 +1,10 @@
 [build-system]
-requires = ["setuptools>=61.0"
+requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "semantic_compressor"
-version = "2.4"
+version = "2.5"
 authors = [
     { name="Carlo Moro", email="cnmoro@gmail.com" },
 ]
@@ -17,10 +17,10 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "numpy
+    "numpy",
     "nltk",
     "scikit-learn",
-    "
+    "lingua-language-detector",
     "model2vec",
     "pyspellchecker"
 ]
{semantic_compressor-2.4 → semantic_compressor-2.5/semantic_compressor.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: semantic_compressor
-Version: 2.4
+Version: 2.5
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -9,13 +9,14 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy
+Requires-Dist: numpy
 Requires-Dist: nltk
 Requires-Dist: scikit-learn
-Requires-Dist:
+Requires-Dist: lingua-language-detector
 Requires-Dist: model2vec
 Requires-Dist: pyspellchecker
 Dynamic: author
+Dynamic: license-file
 Dynamic: requires-python
 
 ```python
{semantic_compressor-2.4 → semantic_compressor-2.5}/semantic_compressor.egg-info/SOURCES.txt
RENAMED
@@ -11,7 +11,6 @@ compressor/minbpe/basic.py
 compressor/minbpe/regex.py
 compressor/resources/__init__.py
 compressor/resources/en_stopwords.pkl
-compressor/resources/lid.176.ftz
 compressor/resources/pt_stopwords.pkl
 compressor/resources/nltk_data/corpora/stopwords.zip
 compressor/resources/nltk_data/corpora/wordnet.zip
@@ -184,4 +183,16 @@ semantic_compressor.egg-info/PKG-INFO
 semantic_compressor.egg-info/SOURCES.txt
 semantic_compressor.egg-info/dependency_links.txt
 semantic_compressor.egg-info/requires.txt
-semantic_compressor.egg-info/top_level.txt
+semantic_compressor.egg-info/top_level.txt
+tests/__init__.py
+tests/conftest.py
+tests/test_benchmark.py
+tests/test_clean_text.py
+tests/test_compression.py
+tests/test_correct_spelling.py
+tests/test_embeddings.py
+tests/test_find_needle_in_haystack.py
+tests/test_language_and_stemming.py
+tests/test_ngrams.py
+tests/test_semantic_embeddings.py
+tests/test_tokenizer.py
{semantic_compressor-2.4 → semantic_compressor-2.5}/setup.py
RENAMED
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='semantic_compressor',
-    version='2.4',
+    version='2.5',
     author='Carlo Moro',
     author_email='cnmoro@gmail.com',
     description="Semantic text compression",
@@ -12,10 +12,10 @@ setup(
     },
     include_package_data=True,
     install_requires=[
-        "numpy
+        "numpy",
         "nltk",
         "scikit-learn",
-        "
+        "lingua-language-detector",
         "model2vec",
         "pyspellchecker"
     ],
semantic_compressor-2.5/tests/__init__.py
ADDED (empty file, shown by the diff viewer as "File without changes")
semantic_compressor-2.5/tests/conftest.py
ADDED
@@ -0,0 +1,92 @@
+import os
+import sys
+import pytest
+import importlib.resources
+
+_NLTK_DATA_PATH = None
+try:
+    _NLTK_DATA_PATH = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
+except Exception:
+    _NLTK_DATA_PATH = os.path.join(
+        os.path.dirname(__file__), '..', 'compressor', 'resources', 'nltk_data'
+    )
+
+os.environ['NLTK_DATA'] = _NLTK_DATA_PATH
+
+from compressor.semantic import (
+    clean_text,
+    detect_language,
+    stem_text,
+    count_tokens,
+    structurize_text,
+    extract_textual_embeddings,
+    calculate_similarity,
+    compute_and_remove_repeated_ngrams,
+    correct_spelling,
+)
+
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "need_model: mark test as requiring the model2vec model (skipped if not available)",
+    )
+
+
+def _check_model_available():
+    try:
+        from compressor.semantic import _get_embedding_model
+        _get_embedding_model()
+        return True
+    except Exception:
+        return False
+
+
+def pytest_collection_modifyitems(config, items):
+    model_available = _check_model_available()
+    if not model_available:
+        skip_need_model = pytest.mark.skip(reason="model2vec model not available")
+        for item in items:
+            if "need_model" in item.keywords:
+                item.add_marker(skip_need_model)
+
+
+@pytest.fixture
+def sample_text_en():
+    return (
+        "The quick brown fox jumps over the lazy dog. "
+        "This is a test sentence for the semantic compressor. "
+        "Natural language processing is a fascinating field. "
+        "Machine learning algorithms can analyze text data efficiently."
+    )
+
+
+@pytest.fixture
+def sample_text_pt():
+    return (
+        "O rato roeu a roupa do rei de Roma. "
+        "Esta é uma frase de teste para o compressor semântico. "
+        "Processamento de linguagem natural é uma área fascinante. "
+        "Algoritmos de aprendizado de máquina podem analisar texto eficientemente."
+    )
+
+
+@pytest.fixture
+def sample_text_noisy():
+    return "Hello, World!!! This is... a very noisy??? text---with | weird • characters [and] (parens)."
+
+
+@pytest.fixture
+def sample_text_hyphenated():
+    return "This is a hyphen- ated word that should be re- paired.\n\nSecond paragraph here."
+
+
+@pytest.fixture
+def sample_text_dense():
+    return (
+        "Artificial intelligence has transformed the modern world. "
+        "Deep learning models can recognize patterns in complex data. "
+        "Neural networks are inspired by the human brain. "
+        "Natural language understanding enables machines to read text. "
+        "Computer vision allows machines to interpret images and video."
+    )