semantic-compressor 1.1__py3-none-any.whl → 1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressor/semantic.py +135 -22
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/METADATA +1 -1
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/RECORD +6 -6
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/top_level.txt +0 -0
compressor/semantic.py CHANGED
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort

 tokenizer = RegexTokenizer()
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )

-
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]

 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
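Note: the new extract_textual_embeddings complements the ONNX semantic embedding with a purely lexical signal: character n-grams of length 1 to 6 hashed into a fixed 512-dimensional vector. A minimal standalone sketch of the same idea, not the package's exact code (the sample string is invented):

from sklearn.feature_extraction.text import HashingVectorizer

# Stateless hasher: character n-grams of length 1-6, hashed into 512 buckets.
vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)

def textual_fingerprint(text):
    sparse = vectorizer.transform([text])          # 1 x 512 sparse matrix
    return sparse.toarray().sum(axis=0).tolist()   # plain list of 512 floats

print(len(textual_fingerprint("the quick brown fox")))  # 512

Because HashingVectorizer keeps no fitted state, transform and the per-text fit_transform call used in the diff behave identically.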
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'

-def
-
-
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1 # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]

+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
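Note: compute_and_remove_repeated_ngrams keeps the first occurrence of any word n-gram that appears more than threshold times and deletes the later ones; the detection step is simply a Counter over a sliding window. A toy sketch of that counting step (invented input, not the package's code):

from collections import Counter

words = "so it goes and so it goes and so it goes and so it goes".split()
ngram_size, threshold = 3, 3

# Sliding-window word trigrams
ngrams = [' '.join(words[i:i + ngram_size]) for i in range(len(words) - ngram_size + 1)]

# Trigrams seen more than `threshold` times get de-duplicated downstream
repeated = [g for g, c in Counter(ngrams).items() if c > threshold]
print(repeated)  # ['so it goes']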
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]

     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding =
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)

         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)

         # Get document-level embedding
-        doc_embedding =
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+            # Compute an weighted average of the two embeddings (60% document and 40% reference)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding

         # Calculate importance for each sentence
         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
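Note: the reference_text steering added above is a plain element-wise blend of two embedding vectors, weighted 60% document and 40% reference. A toy illustration with made-up vectors:

import numpy as np

doc_embedding = np.array([0.2, 0.8, 0.0])        # invented document embedding
reference_embedding = np.array([1.0, 0.0, 0.0])  # invented reference embedding

steered = 0.6 * doc_embedding + 0.4 * reference_embedding
print(steered)  # approximately [0.52, 0.48, 0.0]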
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         # Reorder sentences to maintain original flow
         compressed_text.sort(key=lambda x: sentences.index(x))

-
+        # Capitalize the first letter of each sentence
+        compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
     except Exception:
         traceback.print_exc()

     return full_text

-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
@@ -152,20 +205,80 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+            original_token_count = count_tokens(text)
+            target_token_count = int(original_token_count * compression_rate)
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+    return text

-
-
-
-
-
-
-
-
-
-
-
-
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 350,
+    semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+    return haystack
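Note: taken together, 1.2 extends compress_text with reference_text_steering and introduces find_needle_in_haystack. A hedged usage sketch, assuming these functions remain importable from compressor.semantic as laid out in this wheel (the strings are invented placeholders):

from compressor.semantic import compress_text, find_needle_in_haystack

article = "..."            # placeholder: a long document
query = "battery recall"   # invented steering / search string

# Keep roughly 30% of the tokens, biased toward content related to `query`.
summary = compress_text(article, compression_rate=0.7, reference_text_steering=query)

# Return the block of `article` most similar to `query`
# (weighted 0.3 semantic + 0.7 character-n-gram similarity by default).
passage = find_needle_in_haystack(haystack=article, needle=query)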
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=UUskzKy3Uj90UMDC_zRbPgCr30IxANCUuO1h0nWkKHU,12429
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.2.dist-info/METADATA,sha256=s9pltj6AtpXW6OEcZE1h3W8OPYks_PbhdCbJDR9e5b0,4545
+semantic_compressor-1.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+semantic_compressor-1.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.2.dist-info/RECORD,,
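Note: the RECORD hashes above follow the wheel convention (PEP 376/427): the urlsafe base64 SHA-256 digest with trailing '=' stripped, followed by the file size in bytes. A small sketch for reproducing an entry locally (the path is illustrative):

import base64, hashlib

def record_entry(path):
    with open(path, 'rb') as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
    return f"{path},sha256={digest},{len(data)}"

# For version 1.2, the line for the new semantic.py should end in ...nWkKHU,12429
print(record_entry("compressor/semantic.py"))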
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/LICENSE: File without changes
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/WHEEL: File without changes
{semantic_compressor-1.1.dist-info → semantic_compressor-1.2.dist-info}/top_level.txt: File without changes