semantic-compressor 2.2-py3-none-any.whl → 2.3-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
compressor/semantic.py CHANGED
@@ -14,6 +14,7 @@ from nltk.stem import PorterStemmer
 from nltk.stem import RSLPStemmer
 from collections import Counter
 from model2vec import StaticModel
+import re
 
 tokenizer = RegexTokenizer()
 
@@ -32,6 +33,35 @@ embedding_model = StaticModel.from_pretrained("minishlab/potion-base-2M")
 
 hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
 
+def clean_text(text: str) -> str:
+    # 1) Fix hyphenation at line breaks
+    text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)
+    # 2) Strip stray pipes, bullets, brackets, quotes, unmatched parens
+    text = re.sub(r'[\|\•\[\]\(\)\"“”]', ' ', text)
+    # 3) Remove leading list hyphens
+    text = re.sub(r'(?m)^\s*-\s*', '', text)
+    # 4) Remove hyphens not between letters
+    text = re.sub(r'(?<!\w)-(?!\w)', ' ', text)
+    # 5) Collapse repeated punctuation
+    text = re.sub(r'([!?.,;:]){2,}', r'\1', text)
+    # 6) Normalize whitespace
+    text = re.sub(r'[ \t]+', ' ', text)
+    text = re.sub(r'\n{2,}', '\n', text).strip()
+
+    # 7) Aggressive cleanup if >20% noise, but keep basic punctuation
+    alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
+    if alpha_ratio < 0.8:
+        text = re.sub(r'[^A-Za-zÀ-ÿ\s\.\,\;\:\?\!]', ' ', text)
+        text = re.sub(r'\s{2,}', ' ', text).strip()
+
+    # 8) Reattach punctuation to preceding word and normalize post-punct spacing
+    # "word ." → "word."
+    text = re.sub(r'\s+([\.!,\?;:])', r'\1', text)
+    # "word.Next" → "word. Next"
+    text = re.sub(r'([\.!,\?;:])(?=\S)', r'\1 ', text)
+
+    return text
+
 def extract_textual_embeddings(text):
     X = hashing_vectorizer.fit_transform([text])
     dense_matrix = X.toarray()
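
The new helper is a plain module-level function, so it should be importable directly. A minimal sketch of its behavior on noisy extracted text (the sample string is illustrative, not taken from the package):

    from compressor.semantic import clean_text

    # Broken hyphenation, a stray pipe, curly quotes, repeated "!", and missing post-period spacing
    noisy = "The seman-\ntic | model works “well” !!!It is fast.Very fast."
    print(clean_text(noisy))
    # -> The semantic model works well! It is fast. Very fast.

Note how tight the 0.8 alphabetic-ratio gate in step 7 is: even the partly cleaned sample above is 41 letters out of 52 characters (≈0.79), so the aggressive pass runs, but it is a no-op whenever only whitelisted punctuation remains.
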
@@ -100,7 +130,7 @@ def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
 def calculate_similarity(embed1, embed2):
     return cosine_similarity([embed1], [embed2])[0][0]
 
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
@@ -129,6 +159,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
         return importance
 
     try:
+        if perform_cleaning:
+            full_text = clean_text(full_text)
+
         # Split the text into sentences
         sentences = sent_tokenize(full_text)
 
@@ -192,7 +225,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
 
         return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
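
A quick sketch of the precedence rule the docstring states (`long_text` is a placeholder for any input document, not a name from the package):

    # Both knobs supplied: per the docstring, compression_rate wins
    # and target_token_count is ignored
    summary = compress_text(long_text, target_token_count=128, compression_rate=0.5)
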
@@ -219,7 +252,8 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
         return semantic_compress_text(
             full_text = text,
             compression_rate = compression_rate,
-            reference_text = reference_text_steering
+            reference_text = reference_text_steering,
+            perform_cleaning = perform_cleaning
         )
     except Exception:
         traceback.print_exc()
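
The net effect of the 2.3 changes is backward compatible: cleaning runs by default, and callers can opt out to reproduce the 2.2 behavior. An illustrative sketch (the sample text is invented for the example):

    from compressor.semantic import compress_text

    raw = "Sloppy OCR text with hyphen-\nation and stray | pipes !!"
    cleaned = compress_text(raw, compression_rate=0.7)                           # 2.3 default: clean_text runs first
    verbatim = compress_text(raw, compression_rate=0.7, perform_cleaning=False)  # 2.2 behavior: input passed through as-is
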
semantic_compressor-2.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: semantic_compressor
-Version: 2.2
+Version: 2.3
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
semantic_compressor-2.3.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=0TpOSQBhpfBcqyCs-08NbMOBvdMsvMPvKDlZIz-5Q4Q,15669
+compressor/semantic.py,sha256=36PflgF3qMwEVRWSOgU0IlldvxRZAs9f38EAZuqOk_Y,17065
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -175,8 +175,8 @@ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt,sha
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab,sha256=BhzimBd2qPh12k8kvr1-E4-NodkFe0PQf1gBSOwQajM,273
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab,sha256=_CFCJ_mdXqPucNII3xaxmE6rN10ZRu03kGHGz1wXGL4,642682
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt,sha256=kyOftVtdKubZRahKlOEYuoqBYyaxfNwRuoERvqDJeCg,613
-semantic_compressor-2.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
-semantic_compressor-2.2.dist-info/METADATA,sha256=XMlw60617-mpHu59sdqmvxMUtcPfKOj7xrlriYb2wvw,6178
-semantic_compressor-2.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-semantic_compressor-2.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
-semantic_compressor-2.2.dist-info/RECORD,,
+semantic_compressor-2.3.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-2.3.dist-info/METADATA,sha256=eM7GwG1XgI-vBcj9CU4iTUD9fGby4DmsMRQicIuWW0M,6178
+semantic_compressor-2.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+semantic_compressor-2.3.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-2.3.dist-info/RECORD,,