semantic-compressor 2.1__py3-none-any.whl → 2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. compressor/resources/nltk_data/tokenizers/punkt_tab/README +98 -0
  2. compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
  3. compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
  4. compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +52789 -0
  5. compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
  6. compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
  7. compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
  8. compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +53913 -0
  9. compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
  10. compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
  11. compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
  12. compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +32208 -0
  13. compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
  14. compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
  15. compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
  16. compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +20366 -0
  17. compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
  18. compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
  19. compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
  20. compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +68544 -0
  21. compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
  22. compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
  23. compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
  24. compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +79765 -0
  25. compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
  26. compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
  27. compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
  28. compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +26726 -0
  29. compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
  30. compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
  31. compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
  32. compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +60260 -0
  33. compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
  34. compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
  35. compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +7 -0
  36. compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +29624 -0
  37. compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +54 -0
  38. compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +125 -0
  39. compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +6 -0
  40. compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +29929 -0
  41. compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +40 -0
  42. compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +285 -0
  43. compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +153 -0
  44. compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +10520 -0
  45. compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +14 -0
  46. compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +106 -0
  47. compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +54 -0
  48. compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +54125 -0
  49. compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +63 -0
  50. compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +225 -0
  51. compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +57 -0
  52. compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +81425 -0
  53. compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +71 -0
  54. compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +72 -0
  55. compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +5 -0
  56. compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +30167 -0
  57. compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +40 -0
  58. compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +1989 -0
  59. compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
  60. compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +1 -0
  61. compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
  62. compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +73 -0
  63. compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +74 -0
  64. compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +35434 -0
  65. compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +58 -0
  66. compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +66 -0
  67. compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +7 -0
  68. compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +27443 -0
  69. compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +46 -0
  70. compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +39 -0
  71. compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +8 -0
  72. compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +44485 -0
  73. compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +49 -0
  74. compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +67 -0
  75. compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +14 -0
  76. compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +45926 -0
  77. compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +87 -0
  78. compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
  79. compressor/semantic.py +37 -3
  80. {semantic_compressor-2.1.dist-info → semantic_compressor-2.3.dist-info}/METADATA +1 -1
  81. {semantic_compressor-2.1.dist-info → semantic_compressor-2.3.dist-info}/RECORD +84 -6
  82. {semantic_compressor-2.1.dist-info → semantic_compressor-2.3.dist-info}/LICENSE +0 -0
  83. {semantic_compressor-2.1.dist-info → semantic_compressor-2.3.dist-info}/WHEEL +0 -0
  84. {semantic_compressor-2.1.dist-info → semantic_compressor-2.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,87 @@
1
+ şd
2
+ şimdi
3
+ mahkeme
4
+ ancak
5
+ bilindiği
6
+ arınç
7
+ örneğin
8
+ bunların
9
+ dolayısıyla
10
+ hazine
11
+ anap
12
+ biz
13
+ her
14
+ cumhurbaşkanı
15
+ işte
16
+ hükümet
17
+ diyarbakır
18
+ abd
19
+ artık
20
+ benim
21
+ önümüzdeki
22
+ chp
23
+ bunu
24
+ bence
25
+ akp
26
+ son
27
+ bütün
28
+ irak
29
+ yargıtay
30
+ tüsiad
31
+ ankara
32
+ bu
33
+ iddiaya
34
+ babacan
35
+ tbmm
36
+ bugün
37
+ halbuki
38
+ kanadoğlu
39
+ uzmanlar
40
+ çünkü
41
+ bunun
42
+ burada
43
+ bizim
44
+ buna
45
+ türk
46
+ edinilen
47
+ çiller
48
+ ben
49
+ sezer
50
+ maliye
51
+ başbakan
52
+ yani
53
+ fakat
54
+ diyanet
55
+ imf
56
+ yılmaz
57
+ bana
58
+ ağar
59
+ aslında
60
+ dyp
61
+ bddk
62
+ siz
63
+ gül
64
+ böyle
65
+ eğer
66
+ yeni
67
+ öte
68
+ ab
69
+ sigara
70
+ sonuçta
71
+ bush
72
+ hükümetin
73
+ oysa
74
+ yetkililer
75
+ üstelik
76
+ geçen
77
+ zaten
78
+ böylece
79
+ baykal
80
+ ayrıca
81
+ onlar
82
+ erdoğan
83
+ türkiye
84
+ bunlar
85
+ istanbul
86
+ ama
87
+ ysk
compressor/semantic.py CHANGED
@@ -14,6 +14,7 @@ from nltk.stem import PorterStemmer
14
14
  from nltk.stem import RSLPStemmer
15
15
  from collections import Counter
16
16
  from model2vec import StaticModel
17
+ import re
17
18
 
18
19
  tokenizer = RegexTokenizer()
19
20
 
@@ -32,6 +33,35 @@ embedding_model = StaticModel.from_pretrained("minishlab/potion-base-2M")
32
33
 
33
34
  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
34
35
 
36
+ def clean_text(text: str) -> str:
37
+ # 1) Fix hyphenation at line breaks
38
+ text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)
39
+ # 2) Strip stray pipes, bullets, brackets, quotes, unmatched parens
40
+ text = re.sub(r'[\|\•\[\]\(\)\"“”]', ' ', text)
41
+ # 3) Remove leading list hyphens
42
+ text = re.sub(r'(?m)^\s*-\s*', '', text)
43
+ # 4) Remove hyphens not between letters
44
+ text = re.sub(r'(?<!\w)-(?!\w)', ' ', text)
45
+ # 5) Collapse repeated punctuation
46
+ text = re.sub(r'([!?.,;:]){2,}', r'\1', text)
47
+ # 6) Normalize whitespace
48
+ text = re.sub(r'[ \t]+', ' ', text)
49
+ text = re.sub(r'\n{2,}', '\n', text).strip()
50
+
51
+ # 7) Aggressive cleanup if >20% noise, but keep basic punctuation
52
+ alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
53
+ if alpha_ratio < 0.8:
54
+ text = re.sub(r'[^A-Za-zÀ-ÿ\s\.\,\;\:\?\!]', ' ', text)
55
+ text = re.sub(r'\s{2,}', ' ', text).strip()
56
+
57
+ # 8) Reattach punctuation to preceding word and normalize post-punct spacing
58
+ # "word ." → "word."
59
+ text = re.sub(r'\s+([\.!,\?;:])', r'\1', text)
60
+ # "word.Next" → "word. Next"
61
+ text = re.sub(r'([\.!,\?;:])(?=\S)', r'\1 ', text)
62
+
63
+ return text
64
+
35
65
  def extract_textual_embeddings(text):
36
66
  X = hashing_vectorizer.fit_transform([text])
37
67
  dense_matrix = X.toarray()
@@ -100,7 +130,7 @@ def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
100
130
  def calculate_similarity(embed1, embed2):
101
131
  return cosine_similarity([embed1], [embed2])[0][0]
102
132
 
103
- def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
133
+ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
104
134
  def create_lda_model(texts, stopwords):
105
135
  vectorizer = CountVectorizer(stop_words=stopwords)
106
136
  doc_term_matrix = vectorizer.fit_transform(texts)
@@ -129,6 +159,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
129
159
  return importance
130
160
 
131
161
  try:
162
+ if perform_cleaning:
163
+ full_text = clean_text(full_text)
164
+
132
165
  # Split the text into sentences
133
166
  sentences = sent_tokenize(full_text)
134
167
 
@@ -192,7 +225,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
192
225
 
193
226
  return full_text
194
227
 
195
- def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
228
+ def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
196
229
  """
197
230
  Compress text using either a compression rate or a target token count.
198
231
  If both are provided, the compression rate will be used.
@@ -219,7 +252,8 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
219
252
  return semantic_compress_text(
220
253
  full_text = text,
221
254
  compression_rate = compression_rate,
222
- reference_text = reference_text_steering
255
+ reference_text = reference_text_steering,
256
+ perform_cleaning = perform_cleaning
223
257
  )
224
258
  except Exception:
225
259
  traceback.print_exc()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: semantic_compressor
3
- Version: 2.1
3
+ Version: 2.3
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,5 +1,5 @@
1
1
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- compressor/semantic.py,sha256=0TpOSQBhpfBcqyCs-08NbMOBvdMsvMPvKDlZIz-5Q4Q,15669
2
+ compressor/semantic.py,sha256=36PflgF3qMwEVRWSOgU0IlldvxRZAs9f38EAZuqOk_Y,17065
3
3
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
4
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
5
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -56,6 +56,7 @@ compressor/resources/nltk_data/stemmers/rslp/step5.pt,sha256=1yEVXyDPJvrQE3aQhp1
56
56
  compressor/resources/nltk_data/stemmers/rslp/step6.pt,sha256=Gy6SxIzZBd4b2B5yWiNOLUuciNYme9MOcnEM62RJLdw,143
57
57
  compressor/resources/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle,sha256=JaWhnHzteyusODHaW8CvzCw05d0BzU82G7eZlJppYjg,6138625
58
58
  compressor/resources/nltk_data/tokenizers/punkt.zip,sha256=UcMHiZSur2UL_I4Ci-T7QrSg0XfUHAErapg5eWU2YOw,13905355
59
+ compressor/resources/nltk_data/tokenizers/punkt_tab.zip,sha256=5X9kGHl0J3cmo0F8pvGB7FQDZ2xxdnLu9qdIp7IOAQY,4319076
59
60
  compressor/resources/nltk_data/tokenizers/punkt/.DS_Store,sha256=JDRKA4cwk1i-cV4bgD7JYsmggA_XtgokkqaB24H5GHM,6148
60
61
  compressor/resources/nltk_data/tokenizers/punkt/README,sha256=0yUcrmapNZvWjAOeOkYXKgXKnfCyfc36I65ZTWjSfsQ,8574
61
62
  compressor/resources/nltk_data/tokenizers/punkt/czech.pickle,sha256=W6c9KTx9eVOVa88C82lexcHw1Sfyo8OAl_VZM5T6FpA,1265552
@@ -97,8 +98,85 @@ compressor/resources/nltk_data/tokenizers/punkt/PY3/slovene.pickle,sha256=Uu8swO
97
98
  compressor/resources/nltk_data/tokenizers/punkt/PY3/spanish.pickle,sha256=FkpQ-txaSfjsdCbq4R0xEe51K0ij7zc9R3RQERkqWYQ,562337
98
99
  compressor/resources/nltk_data/tokenizers/punkt/PY3/swedish.pickle,sha256=sPfVOL_VJmYzsJ6ELNkungrBDx2SO_IR4Ul5ct3Ecxg,979681
99
100
  compressor/resources/nltk_data/tokenizers/punkt/PY3/turkish.pickle,sha256=rmjvWGNyisUzLofrH2uudy_zKhOkyqKwGlxoED6FPFs,1017038
100
- semantic_compressor-2.1.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
101
- semantic_compressor-2.1.dist-info/METADATA,sha256=-ubUP-0T2LvMpBk3hUG1c429Y-9HuknQX_CciuIbS1c,6178
102
- semantic_compressor-2.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
103
- semantic_compressor-2.1.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
104
- semantic_compressor-2.1.dist-info/RECORD,,
101
+ compressor/resources/nltk_data/tokenizers/punkt_tab/README,sha256=0yUcrmapNZvWjAOeOkYXKgXKnfCyfc36I65ZTWjSfsQ,8574
102
+ compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt,sha256=jrH-taakb5vY9qWuWOIYQopltFm169YWaNg_qArPwyk,462
103
+ compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab,sha256=_Vj1SWWMg4m-Mld-IyGHvpN2x5cI9mxgOIqEkWyPPz0,1390
104
+ compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab,sha256=RRyxpooDuojO0SR-a7SMRC6H3OHJ9GMQRhj6xQmGbNQ,684150
105
+ compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt,sha256=QvHeGKsp-YQhE7h-wCAgLAwlMWyX03-WhDWnx-4wGMo,352
106
+ compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt,sha256=SjWLJre6NyyrPO3rQ7TpmnaunQEVBJOWQUHrQqYuSm4,1130
107
+ compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab,sha256=sAPTuDmv1Hr6gdZGIKHPFbdblEISp_PuanVx8EGuAuw,1693
108
+ compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab,sha256=WdfM2nse72y83bbS8ZMMJVpdQ2AOZuMu_yjquBkMByc,743835
109
+ compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt,sha256=-N1BvbG7YzV0si04IF4cBzY5oLgMuVkZZL-S2EMXUn8,415
110
+ compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt,sha256=3NSvBxAWksmGBJugerzWt8GiRk705GqTLPktbiIDiN0,434
111
+ compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab,sha256=mgE1Y7zkY3WVptudLusKs_aGhdcEriDZetcYqpGvvko,623
112
+ compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab,sha256=4gd7V0hWSpZm5DfAqW6pxb7Q_jq1x1ZTVYR4XTryD8k,428403
113
+ compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt,sha256=c0RUrMWVLZGh36yozf2rtxXYzdNXicdJUoY52AdR9Fw,310
114
+ compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt,sha256=kqPgcPQ9m0xVNHWMpArXNDsE5-Kb_gwutlijlEWk93k,619
115
+ compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab,sha256=ji2hIl5N0syduiYe4jHMsTSFniG0YAbn9HLF7iaa8M8,594
116
+ compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab,sha256=S7zKJe09PwbAJAKr-EGbnwM7itwG57SC7KTkX4Gl3Ew,236303
117
+ compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt,sha256=8_hTVIPh26SHJBt2SUUWgSO8oyCalkXlms0SJdx27aw,241
118
+ compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt,sha256=AweHj7Mq8eIdjzugZE45yJYC8viqTkMElD9-ng_QgbI,224
119
+ compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab,sha256=8UAKfr88gqPVqsiG61vwahqECUbPBSuxUy2nwlCUeX8,1825
120
+ compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab,sha256=dot-biupP_CW4kSvF1w0OxpzRjxKG5m-6-apqlfqHXg,936992
121
+ compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt,sha256=et57uMp8UnqsCVF6bvvfT3Semp_A5D0oVsWL6EcDDIM,679
122
+ compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt,sha256=eCm6q8tdaeJiMoOaH1trxtErDlm38KZfarHBY6y-uAA,484
123
+ compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab,sha256=TXF6G00ftxCMqcntLMY_i0gTz69zgZ6W_OeUld3EgpI,3046
124
+ compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab,sha256=Q-Vai_qXdWEzegkQXFcANt8RNGxvkeV9hjFSDu-yYWc,1189717
125
+ compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt,sha256=q4HPO3dO8GismK25n8s7GgXK8EI736MCvcKF4VWXVn8,810
126
+ compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt,sha256=AXR-z-0Igw1YLYyUY0tjo22jFsCmf-7ChxzwPw3Vxlc,245
127
+ compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab,sha256=jd8VDvWg0iaU58TYB_gLbVXacxyoHqE5Yz9WCzeK-ZA,334
128
+ compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab,sha256=cJDvQpJkfU_hNxWb12ilMEAV0GFNWf3PwwsjiWNDZaw,334275
129
+ compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt,sha256=tFoeva6c30bPLovgFrRxi_T4JURfx4PURfrwz7LVafg,233
130
+ compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt,sha256=JZ5deLydYaEtVbpHVSxOnFygg2TXikZp6GT9behoGOM,246
131
+ compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab,sha256=93vVE_QugXn1yTIRHDEr41PVkTOH0L_pCpXBCREbyiU,527
132
+ compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab,sha256=AjiH54z7kZ8aAPmpaoj0ITFYJ0jPgz_UaUkO5mWWMHw,943500
133
+ compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt,sha256=_ktj7b3TP3i6CnVfA1aAqGSUixRa6C_JkHJECmKnes0,762
134
+ compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt,sha256=DVYlO368g_DyQDR5vPK46lH_3qie5g40AY6zLkdQ1oM,570
135
+ compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab,sha256=D9cyU01WF3yAh0ElyolK5gruewUu4oZaQFtFTP1zem8,118
136
+ compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab,sha256=qH5EZ3kfl0-AjA05b2yCzToBTU-0RYoih-L8vOcITpw,632838
137
+ compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt,sha256=GkUK7IUWTj-_64FHaMnxIKvXE5cmr1u14qV3CWc5yo4,567
138
+ compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt,sha256=XUBva_s15V9r6igr8Ui9zJ5Gd4YOsBqOZ8NLiUJ0rFA,534
139
+ compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab,sha256=gWe1igSbsE3o1pLtM9cpY1A6BhcXzTfKyzO7LQBmnK8,53
140
+ compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab,sha256=Ri_9SLwcm4bv5m0E2CbNHnpXRb6HG4EhdZ7SkgqWc3A,369822
141
+ compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt,sha256=TcLN_aNNTyhuWHXWtisUh3GnmacfDUW1BNojy2G1nVE,208
142
+ compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt,sha256=HqLNAWKcjL6sB5nTk4PhDE1pSHNMJV03487aVQumJGQ,4714
143
+ compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab,sha256=cbw45CEz8ojGYKM4Fp2IWJRBNSgq1BZqLF0jzIoG7XM,3422
144
+ compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab,sha256=UonkP0bvNnn0677ZXvH61bZN16qJVwQq2ytmHMauRCU,118161
145
+ compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt,sha256=kYyQKqnQtycAQQJYJ0rjNdpRFI_uf8JwlatBFLI6JFw,106
146
+ compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt,sha256=vs7XQHbtq-ZPRe_7gjAS0TlttB81-h2L4qxrab97xA8,496
147
+ compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab,sha256=nIFbAeL3UYVStT8JASZE3x-g2MdkK6_53F9h6F4L5CI,869
148
+ compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab,sha256=V-oqoSneauQUzQy-y6iyOZxP3ndaIUV3an6PECPz8YM,734070
149
+ compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt,sha256=K0nvDwwuG06LBq7hFy5NiM5q0BSG_kGCz_b8RDpz3es,378
150
+ compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt,sha256=Qm6jfKSG0NzDZTZTxhPYU9C7OSkDf47mv2wS3a6HOqE,973
151
+ compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab,sha256=lhZT8GBthey4p2DK1Pad8SPwELxLNd4CU_6y14aWZrQ,1101
152
+ compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab,sha256=93CAQGcBLuVwf_xl2QUjGA4JurCSMKSSUXd76CGhpPY,1066577
153
+ compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt,sha256=hWQHauIr49VJo_CXoYRYrCln1lvSja18wiPFP1zifRs,518
154
+ compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt,sha256=zVw8-cTIt2a8ntvJZESw4hibg3smxYOqLM6DoY5ChfU,301
155
+ compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab,sha256=oVDSyjuL2dh80l8uHSFCYHzfk0Tv6RS9A0r2AXoW2EE,64
156
+ compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab,sha256=LZwUP3syL8dHOKIeCysE_u0WaCRayT5ajlOZ5uDY7Yo,363124
157
+ compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt,sha256=e39KFYax34WSvylMjdDRmq-QdA14zjtAox7_K5Zp0V4,209
158
+ compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt,sha256=dRkVJr5RapkYlI_jrUb2wsTe4UMUNZqJm4wuv0lstjE,15363
159
+ compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
160
+ compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab,sha256=spbLqRAGVp0-956yz_I3i_W-W3Xp6D6hGbiD08PuabY,10
161
+ compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
+ compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt,sha256=sdaxbR0rPAxBMsFSi5hD0VV_f1RARNsn9OjS1phfrys,280
163
+ compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab,sha256=TBDar9_0luNp86iAm-vxh6vIUd9jBFeuk8fKT9o1BlE,1131
164
+ compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab,sha256=O6gmvdflVDr3r44xfUZSuotH_YEhV4VaYnTsE4ekaz4,445129
165
+ compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt,sha256=ha4U2t7zxMn3U0kdhJGauxDgkAOt_iSHlNSll_q8V5M,358
166
+ compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt,sha256=JGpLPvutRbY-uWcrcYV0tTWs721lbyTcF6RtE6OO-6E,234
167
+ compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab,sha256=-l6n2_YfKxwSGA1mCjASyfqvSWLrElSdd6-V4qQL5l0,74
168
+ compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab,sha256=E0bs_FeyYXSYf_qb1KJVimb0ZZKlE9o4WWSHkDUd83A,337707
169
+ compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt,sha256=TU9zgLoWOAthWOno6taXPFLyfjyciK9j8VpBxJCZgxk,289
170
+ compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt,sha256=Hhp4vV5_1ufrSxW3UaI9RBzahF1fwLYxlYF3svUE2gA,208
171
+ compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab,sha256=vSJgQzyUYXunz2EC5MEDUJmnLlova2BzRjklLkFjPAQ,136
172
+ compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab,sha256=1IxXBdBXCZNijlOcohHwXNtarXrfzG7XskMQlBv6yfg,616904
173
+ compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt,sha256=bw96Z3P3SIojuvxsKefUVU55LYnGt6ifvWqDvshe92c,295
174
+ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt,sha256=3lYHjF7rP-B-JOq2fEqvROIYHL6A7dl2yxYigZWZ1v0,424
175
+ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab,sha256=BhzimBd2qPh12k8kvr1-E4-NodkFe0PQf1gBSOwQajM,273
176
+ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab,sha256=_CFCJ_mdXqPucNII3xaxmE6rN10ZRu03kGHGz1wXGL4,642682
177
+ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt,sha256=kyOftVtdKubZRahKlOEYuoqBYyaxfNwRuoERvqDJeCg,613
178
+ semantic_compressor-2.3.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
179
+ semantic_compressor-2.3.dist-info/METADATA,sha256=eM7GwG1XgI-vBcj9CU4iTUD9fGby4DmsMRQicIuWW0M,6178
180
+ semantic_compressor-2.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
181
+ semantic_compressor-2.3.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
182
+ semantic_compressor-2.3.dist-info/RECORD,,