semantic-compressor 1.1__py3-none-any.whl → 1.2__py3-none-any.whl
- compressor/semantic.py +135 -22
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/METADATA +1 -1
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/RECORD +6 -6
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort
 
 tokenizer = RegexTokenizer()
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )
 
-
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
 
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
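Reviewer note: the new extract_textual_embeddings maps text of any length to a fixed 512-dimensional character n-gram vector. A minimal standalone sketch of the same idea, using only scikit-learn and NumPy (no ONNX model; the helper name textual_embedding is illustrative, not part of the package):

# Illustrative sketch of the character n-gram hashing embedding added in 1.2.
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np

vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)

def textual_embedding(text):
    # HashingVectorizer is stateless, so fit_transform here only transforms.
    X = vectorizer.fit_transform([text])
    return X.toarray().sum(axis=0).tolist()

vec = textual_embedding("the quick brown fox")
print(len(vec))  # 512, regardless of input length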
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
 
-def
-
-
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size  # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1  # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]
 
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
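Reviewer note: compute_and_remove_repeated_ngrams keeps the first occurrence of any word n-gram that appears more than `threshold` times and deletes later aligned repeats. A hedged usage sketch, assuming the wheel is installed and its bundled fastText/ONNX resources load at import time (the sample text is made up):

from compressor.semantic import compute_and_remove_repeated_ngrams

text = "foo bar baz " * 5 + "and then something else"
print(compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3))
# Expected: "foo bar baz and then something else" (only the first repeat survives)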
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]
 
     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding =
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
 
         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
 
         # Get document-level embedding
-        doc_embedding =
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+            # Compute a weighted average of the two embeddings (60% document and 40% reference)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
 
         # Calculate importance for each sentence
         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
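Reviewer note: reference-text steering simply blends the document embedding with the reference embedding (60/40) before scoring sentences. A small NumPy illustration with made-up vectors (the real code blends ONNX sentence embeddings):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

doc_embedding = np.array([1.0, 0.0, 0.2])        # made-up document embedding
reference_embedding = np.array([0.0, 1.0, 0.2])  # made-up reference embedding

steered = 0.6 * doc_embedding + 0.4 * reference_embedding

sentence = np.array([0.1, 0.9, 0.1])             # sentence aligned with the reference
print(cosine_similarity([doc_embedding], [sentence])[0][0])  # lower score
print(cosine_similarity([steered], [sentence])[0][0])        # higher score after steering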
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         # Reorder sentences to maintain original flow
         compressed_text.sort(key=lambda x: sentences.index(x))
 
-
+        # Capitalize the first letter of each sentence
+        compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
     except Exception:
         traceback.print_exc()
 
     return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
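Reviewer note: after sentence selection, 1.2 adds a post-processing chain: capitalize each kept sentence, join them, collapse double spaces, then strip repeated n-grams. A standalone sketch of the string cleanup steps (note that str.capitalize also lowercases the rest of each sentence):

compressed = ["the model keeps this sentence.", "and this one too."]
compressed = [s.capitalize() for s in compressed]
cleaned = ' '.join(compressed).replace('  ', ' ').strip()
print(cleaned)  # "The model keeps this sentence. And this one too."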
@@ -152,20 +205,80 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+            original_token_count = count_tokens(text)
+            target_token_count = int(original_token_count * compression_rate)
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+    return text
 
-
-
-
-
-
-
-
-
-
-
-
-
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 350,
+    semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+    return haystack
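Reviewer note: putting the 1.2 additions together, a hypothetical end-to-end usage sketch (assumes the wheel is installed, its bundled models load correctly, and article.txt, the reference query, and the needle question are placeholders):

from compressor.semantic import compress_text, find_needle_in_haystack

article = open("article.txt", encoding="utf-8").read()  # placeholder document

# Compress to roughly 30% of the original tokens, steered toward a reference query.
summary = compress_text(article, compression_rate=0.7,
                        reference_text_steering="climate policy announcements")

# Return the ~350-token block most similar to the question, plus its neighbouring blocks.
region = find_needle_in_haystack(haystack=article,
                                 needle="What did the minister announce?")

print(summary)
print(region)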
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=UUskzKy3Uj90UMDC_zRbPgCr30IxANCUuO1h0nWkKHU,12429
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.2.dist-info/METADATA,sha256=s9pltj6AtpXW6OEcZE1h3W8OPYks_PbhdCbJDR9e5b0,4545
+semantic_compressor-1.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+semantic_compressor-1.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.2.dist-info/RECORD,,
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/LICENSE
File without changes
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/WHEEL
File without changes
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/top_level.txt
File without changes