semantic-compressor 2.2-py3-none-any.whl → 2.4-py3-none-any.whl
This diff compares the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
Potentially problematic release.
This version of semantic-compressor might be problematic.
- compressor/semantic.py +38 -4
- {semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/METADATA +1 -1
- {semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/RECORD +6 -6
- {semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/LICENSE +0 -0
- {semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/WHEEL +0 -0
- {semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -14,6 +14,7 @@ from nltk.stem import PorterStemmer
 from nltk.stem import RSLPStemmer
 from collections import Counter
 from model2vec import StaticModel
+import re
 
 tokenizer = RegexTokenizer()
 
@@ -28,10 +29,39 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
 portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
 langdetect_model = fasttext.load_model(fasttext_model_path)
 
-embedding_model = StaticModel.from_pretrained("
+embedding_model = StaticModel.from_pretrained("cnmoro/Linq-Embed-Mistral-Distilled")
 
 hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
 
+def clean_text(text: str) -> str:
+    # 1) Fix hyphenation at line breaks
+    text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)
+    # 2) Strip stray pipes, bullets, brackets, quotes, unmatched parens
+    text = re.sub(r'[\|\•\[\]\(\)\"“”]', ' ', text)
+    # 3) Remove leading list hyphens
+    text = re.sub(r'(?m)^\s*-\s*', '', text)
+    # 4) Remove hyphens not between letters
+    text = re.sub(r'(?<!\w)-(?!\w)', ' ', text)
+    # 5) Collapse repeated punctuation
+    text = re.sub(r'([!?.,;:]){2,}', r'\1', text)
+    # 6) Normalize whitespace
+    text = re.sub(r'[ \t]+', ' ', text)
+    text = re.sub(r'\n{2,}', '\n', text).strip()
+
+    # 7) Aggressive cleanup if >20% noise, but keep basic punctuation
+    alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
+    if alpha_ratio < 0.8:
+        text = re.sub(r'[^A-Za-zÀ-ÿ\s\.\,\;\:\?\!]', ' ', text)
+        text = re.sub(r'\s{2,}', ' ', text).strip()
+
+    # 8) Reattach punctuation to preceding word and normalize post-punct spacing
+    # "word ." → "word."
+    text = re.sub(r'\s+([\.!,\?;:])', r'\1', text)
+    # "word.Next" → "word. Next"
+    text = re.sub(r'([\.!,\?;:])(?=\S)', r'\1 ', text)
+
+    return text
+
 def extract_textual_embeddings(text):
     X = hashing_vectorizer.fit_transform([text])
     dense_matrix = X.toarray()
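Reviewer note (not part of the diff): a minimal sketch of how the new clean_text helper behaves, assuming the 2.4 wheel is installed and using the module path shown in this diff; the sample string and expected result are illustrative, not taken from the package.

    # Illustrative only: exercises the clean_text() added in 2.4.
    from compressor.semantic import clean_text

    noisy = "• The quick bro-\nwn fox | jumps!!!  over the [lazy] dog .It sleeps"
    print(clean_text(noisy))
    # Roughly: "The quick brown fox jumps! over the lazy dog. It sleeps"
    # (line-break hyphenation repaired, bullets/pipes/brackets stripped,
    #  "!!!" collapsed, the stray " ." reattached and respaced)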
@@ -100,7 +130,7 @@ def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
 def calculate_similarity(embed1, embed2):
     return cosine_similarity([embed1], [embed2])[0][0]
 
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None, perform_cleaning: bool = True):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
@@ -129,6 +159,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
         return importance
 
     try:
+        if perform_cleaning:
+            full_text = clean_text(full_text)
+
         # Split the text into sentences
         sentences = sent_tokenize(full_text)
 
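Reviewer note (not part of the diff): the new flag gates the clean_text pass at the top of the try block, so callers with already-normalized input can bypass it. A hedged call sketch against the signature shown above; the sample text is made up.

    from compressor.semantic import semantic_compress_text

    doc = "Already clean prose that should not be rewritten before compression. " * 20
    summary = semantic_compress_text(doc, compression_rate=0.5, perform_cleaning=False)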
@@ -192,7 +225,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, refere
 
     return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None, perform_cleaning=True):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
@@ -219,7 +252,8 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
         return semantic_compress_text(
             full_text = text,
             compression_rate = compression_rate,
-            reference_text = reference_text_steering
+            reference_text = reference_text_steering,
+            perform_cleaning = perform_cleaning
        )
     except Exception:
         traceback.print_exc()
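Reviewer note (not part of the diff): compress_text simply forwards the new keyword to semantic_compress_text, so cleaning stays on by default for the public entry point. An illustrative sketch; the input string is made up.

    from compressor.semantic import compress_text

    raw = "Some noi-\nsy | extracted [text] to shrink"
    shorter = compress_text(raw, compression_rate=0.7)                             # cleaning on (default)
    untouched = compress_text(raw, compression_rate=0.7, perform_cleaning=False)   # skip clean_text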
{semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=sJXbap3_oZpd-XMGkecrqQ7RSon-OE98u8iYcNRIskA,17076
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -175,8 +175,8 @@ compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt,sha
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab,sha256=BhzimBd2qPh12k8kvr1-E4-NodkFe0PQf1gBSOwQajM,273
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab,sha256=_CFCJ_mdXqPucNII3xaxmE6rN10ZRu03kGHGz1wXGL4,642682
 compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt,sha256=kyOftVtdKubZRahKlOEYuoqBYyaxfNwRuoERvqDJeCg,613
-semantic_compressor-2.
-semantic_compressor-2.
-semantic_compressor-2.
-semantic_compressor-2.
-semantic_compressor-2.
+semantic_compressor-2.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-2.4.dist-info/METADATA,sha256=-QsGOKQoDo4YBW88p-KWPrr0GlHrgvygoRnSN2C09-Y,6178
+semantic_compressor-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+semantic_compressor-2.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-2.4.dist-info/RECORD,,
{semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/LICENSE: file without changes
{semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/WHEEL: file without changes
{semantic_compressor-2.2.dist-info → semantic_compressor-2.4.dist-info}/top_level.txt: file without changes