semantic-compressor 2.0__tar.gz → 2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_compressor-2.0/semantic_compressor.egg-info → semantic_compressor-2.2}/PKG-INFO +1 -1
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/README +98 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +52789 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +53913 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +32208 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +20366 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +68544 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +79765 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +26726 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +60260 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +7 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +29624 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +54 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +125 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +6 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +29929 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +40 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +285 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +153 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +10520 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +14 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +106 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +54 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +54125 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +63 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +225 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +57 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +81425 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +71 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +72 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +5 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +30167 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +40 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +1989 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +1 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +73 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +74 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +35434 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +58 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +66 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +7 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +27443 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +46 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +39 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +8 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +44485 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +49 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +67 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +14 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +45926 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +87 -0
- semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/semantic.py +1 -1
- {semantic_compressor-2.0 → semantic_compressor-2.2}/pyproject.toml +1 -1
- {semantic_compressor-2.0 → semantic_compressor-2.2/semantic_compressor.egg-info}/PKG-INFO +1 -1
- semantic_compressor-2.2/semantic_compressor.egg-info/SOURCES.txt +187 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/setup.py +1 -1
- semantic_compressor-2.0/semantic_compressor.egg-info/SOURCES.txt +0 -109
- {semantic_compressor-2.0 → semantic_compressor-2.2}/LICENSE +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/MANIFEST.in +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/README.md +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/__init__.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/minbpe/__init__.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/minbpe/base.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/minbpe/basic.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/minbpe/regex.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/__init__.py +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/en_stopwords.pkl +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/lid.176.ftz +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/README +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/albanian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/arabic +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/azerbaijani +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/basque +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/belarusian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/bengali +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/catalan +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/chinese +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/danish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/dutch +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/english +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/finnish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/french +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/german +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/greek +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/hebrew +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/hinglish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/hungarian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/indonesian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/italian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/kazakh +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/nepali +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/norwegian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/portuguese +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/romanian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/russian +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/slovene +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/spanish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/swedish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/tajik +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/tamil +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords/turkish +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/stopwords.zip +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/wordnet.zip +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/words/README +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/words/en +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/corpora/words/en-basic +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step0.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step1.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step2.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step3.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step4.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step5.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp/step6.pt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/stemmers/rslp.zip +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/.DS_Store +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/README +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/czech.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/danish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/dutch.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/english.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/estonian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/finnish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/french.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/german.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/greek.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/italian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/malayalam.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/norwegian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/polish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/portuguese.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/russian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/README +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/czech.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/danish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/dutch.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/english.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/estonian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/finnish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/french.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/german.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/greek.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/italian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/malayalam.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/norwegian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/polish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/portuguese.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/russian.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/slovene.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/spanish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/swedish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt/turkish.pickle +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/nltk_data/tokenizers/punkt.zip +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/compressor/resources/pt_stopwords.pkl +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/semantic_compressor.egg-info/dependency_links.txt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/semantic_compressor.egg-info/requires.txt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/semantic_compressor.egg-info/top_level.txt +0 -0
- {semantic_compressor-2.0 → semantic_compressor-2.2}/setup.cfg +0 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
2
|
+
|
3
|
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
4
|
+
been contributed by various people using NLTK for sentence boundary detection.
|
5
|
+
|
6
|
+
For information about how to use these models, please confer the tokenization HOWTO:
|
7
|
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
8
|
+
and chapter 3.8 of the NLTK book:
|
9
|
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
10
|
+
|
11
|
+
There are pretrained tokenizers for the following languages:
|
12
|
+
|
13
|
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
14
|
+
=======================================================================================================================================================================
|
15
|
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
16
|
+
Literarni Noviny
|
17
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
18
|
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
19
|
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
20
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
21
|
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
22
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
23
|
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
24
|
+
(American)
|
25
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
26
|
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
27
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
28
|
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
29
|
+
Text Bank (Suomen Kielen newspapers
|
30
|
+
Tekstipankki)
|
31
|
+
Finnish Center for IT Science
|
32
|
+
(CSC)
|
33
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
34
|
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
35
|
+
(European)
|
36
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
37
|
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
38
|
+
(Switzerland) CD-ROM
|
39
|
+
(Uses "ss"
|
40
|
+
instead of "ß")
|
41
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
42
|
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
43
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
44
|
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
45
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
46
|
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
47
|
+
(Bokmål and Information Technologies,
|
48
|
+
Nynorsk) Bergen
|
49
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
50
|
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
51
|
+
(http://www.nkjp.pl/)
|
52
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
53
|
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
54
|
+
(Brazilian) (Linguateca)
|
55
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
56
|
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
57
|
+
Slovene Academy for Arts
|
58
|
+
and Sciences
|
59
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
60
|
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
61
|
+
(European)
|
62
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
63
|
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
64
|
+
(and some other texts)
|
65
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
66
|
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
67
|
+
(Türkçe Derlem Projesi)
|
68
|
+
University of Ankara
|
69
|
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
70
|
+
|
71
|
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
72
|
+
Unicode using the codecs module.
|
73
|
+
|
74
|
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
75
|
+
Computational Linguistics 32: 485-525.
|
76
|
+
|
77
|
+
---- Training Code ----
|
78
|
+
|
79
|
+
# import punkt
|
80
|
+
import nltk.tokenize.punkt
|
81
|
+
|
82
|
+
# Make a new Tokenizer
|
83
|
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
84
|
+
|
85
|
+
# Read in training corpus (one example: Slovene)
|
86
|
+
import codecs
|
87
|
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
88
|
+
|
89
|
+
# Train tokenizer
|
90
|
+
tokenizer.train(text)
|
91
|
+
|
92
|
+
# Dump pickled tokenizer
|
93
|
+
import pickle
|
94
|
+
out = open("slovene.pickle","wb")
|
95
|
+
pickle.dump(tokenizer, out)
|
96
|
+
out.close()
|
97
|
+
|
98
|
+
---------
|
semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
t
|
2
|
+
množ
|
3
|
+
např
|
4
|
+
j.h
|
5
|
+
man
|
6
|
+
ú
|
7
|
+
jug
|
8
|
+
dr
|
9
|
+
bl
|
10
|
+
ml
|
11
|
+
okr
|
12
|
+
st
|
13
|
+
uh
|
14
|
+
šp
|
15
|
+
judr
|
16
|
+
u.s.a
|
17
|
+
p
|
18
|
+
arg
|
19
|
+
žitě
|
20
|
+
st.celsia
|
21
|
+
etc
|
22
|
+
p.s
|
23
|
+
t.r
|
24
|
+
lok
|
25
|
+
mil
|
26
|
+
ict
|
27
|
+
n
|
28
|
+
tl
|
29
|
+
min
|
30
|
+
č
|
31
|
+
d
|
32
|
+
al
|
33
|
+
ravenně
|
34
|
+
mj
|
35
|
+
nar
|
36
|
+
plk
|
37
|
+
s.p
|
38
|
+
a.g
|
39
|
+
roč
|
40
|
+
b
|
41
|
+
zdi
|
42
|
+
r.s.c
|
43
|
+
přek
|
44
|
+
m
|
45
|
+
gen
|
46
|
+
csc
|
47
|
+
mudr
|
48
|
+
vic
|
49
|
+
š
|
50
|
+
sb
|
51
|
+
resp
|
52
|
+
tzn
|
53
|
+
iv
|
54
|
+
s.r.o
|
55
|
+
mar
|
56
|
+
w
|
57
|
+
čs
|
58
|
+
vi
|
59
|
+
tzv
|
60
|
+
ul
|
61
|
+
pen
|
62
|
+
zv
|
63
|
+
str
|
64
|
+
čp
|
65
|
+
org
|
66
|
+
rak
|
67
|
+
sv
|
68
|
+
pplk
|
69
|
+
u.s
|
70
|
+
prof
|
71
|
+
c.k
|
72
|
+
op
|
73
|
+
g
|
74
|
+
vii
|
75
|
+
kr
|
76
|
+
ing
|
77
|
+
j.o
|
78
|
+
drsc
|
79
|
+
m3
|
80
|
+
l
|
81
|
+
tr
|
82
|
+
ceo
|
83
|
+
ch
|
84
|
+
fuk
|
85
|
+
vl
|
86
|
+
viii
|
87
|
+
líp
|
88
|
+
hl.m
|
89
|
+
t.zv
|
90
|
+
phdr
|
91
|
+
o.k
|
92
|
+
tis
|
93
|
+
doc
|
94
|
+
kl
|
95
|
+
ard
|
96
|
+
čkd
|
97
|
+
pok
|
98
|
+
apod
|
99
|
+
r
|
100
|
+
př
|
101
|
+
a.s
|
102
|
+
j
|
103
|
+
jr
|
104
|
+
i.m
|
105
|
+
e
|
106
|
+
kupř
|
107
|
+
f
|
108
|
+
tř
|
109
|
+
xvi
|
110
|
+
mir
|
111
|
+
atď
|
112
|
+
vr
|
113
|
+
r.i.v
|
114
|
+
hl
|
115
|
+
kv
|
116
|
+
t.j
|
117
|
+
y
|
118
|
+
q.p.r
|
semantic_compressor-2.2/compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
i dejmala
|
2
|
+
##number## prosince
|
3
|
+
h steina
|
4
|
+
##number## listopadu
|
5
|
+
a dvořák
|
6
|
+
v klaus
|
7
|
+
i čnhl
|
8
|
+
##number## wladyslawowo
|
9
|
+
##number## letech
|
10
|
+
a jiráska
|
11
|
+
a dubček
|
12
|
+
##number## štrasburk
|
13
|
+
##number## juniorské
|
14
|
+
##number## století
|
15
|
+
##number## kola
|
16
|
+
##number## pád
|
17
|
+
##number## května
|
18
|
+
##number## týdne
|
19
|
+
v dlouhý
|
20
|
+
k design
|
21
|
+
##number## červenec
|
22
|
+
i ligy
|
23
|
+
##number## kolo
|
24
|
+
z svěrák
|
25
|
+
##number## mája
|
26
|
+
##number## šimková
|
27
|
+
a bělého
|
28
|
+
a bradáč
|
29
|
+
##number## ročníku
|
30
|
+
##number## dubna
|
31
|
+
a vivaldiho
|
32
|
+
v mečiara
|
33
|
+
c carrićre
|
34
|
+
##number## sjezd
|
35
|
+
##number## výroční
|
36
|
+
##number## kole
|
37
|
+
##number## narozenin
|
38
|
+
k maleevová
|
39
|
+
i čnfl
|
40
|
+
##number## pádě
|
41
|
+
##number## září
|
42
|
+
##number## výročí
|
43
|
+
a dvořáka
|
44
|
+
h g.
|
45
|
+
##number## ledna
|
46
|
+
a dvorský
|
47
|
+
h měsíc
|
48
|
+
##number## srpna
|
49
|
+
##number## tř.
|
50
|
+
a mozarta
|
51
|
+
##number## sudetoněmeckých
|
52
|
+
o sokolov
|
53
|
+
k škrach
|
54
|
+
v benda
|
55
|
+
##number## symfonie
|
56
|
+
##number## července
|
57
|
+
x šalda
|
58
|
+
c abrahama
|
59
|
+
a tichý
|
60
|
+
##number## místo
|
61
|
+
k bielecki
|
62
|
+
v havel
|
63
|
+
##number## etapu
|
64
|
+
a dubčeka
|
65
|
+
i liga
|
66
|
+
##number## světový
|
67
|
+
v klausem
|
68
|
+
##number## ženy
|
69
|
+
##number## létech
|
70
|
+
##number## minutě
|
71
|
+
##number## listopadem
|
72
|
+
##number## místě
|
73
|
+
o vlček
|
74
|
+
k peteraje
|
75
|
+
i sponzor
|
76
|
+
##number## června
|
77
|
+
##number## min.
|
78
|
+
##number## oprávněnou
|
79
|
+
##number## květnu
|
80
|
+
##number## aktu
|
81
|
+
##number## květnem
|
82
|
+
##number## října
|
83
|
+
i rynda
|
84
|
+
##number## února
|
85
|
+
i snfl
|
86
|
+
a mozart
|
87
|
+
z košler
|
88
|
+
a dvorskému
|
89
|
+
v marhoul
|
90
|
+
v mečiar
|
91
|
+
##number## ročník
|
92
|
+
##number## máje
|
93
|
+
v havla
|
94
|
+
k gott
|
95
|
+
s bacha
|
96
|
+
##number## ad
|