semantic-compressor 1.1__py3-none-any.whl → 1.2__py3-none-any.whl

compressor/semantic.py CHANGED
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort

 tokenizer = RegexTokenizer()
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )

-def extract_embeddings(text):
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]

 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
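Note: the new extract_textual_embeddings maps a string to a fixed 512-dimensional vector of hashed character 1- to 6-grams, so it rewards literal character overlap rather than meaning, while extract_semantic_embeddings keeps covering the semantic side via the ONNX model. A minimal standalone sketch of the same idea, with invented example strings; HashingVectorizer is stateless, so the transform() call here behaves the same as the fit_transform() used in the wheel:

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)

def char_ngram_vector(text):
    # 512-dim hashed character n-gram vector for a single string
    return hashing_vectorizer.transform([text]).toarray()[0]

a = char_ngram_vector("quarterly revenue report")
b = char_ngram_vector("report on quarterly revenues")
print(cosine_similarity([a], [b])[0][0])  # high score: heavy character overlap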
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'

-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
-    def calculate_similarity(embed1, embed2):
-        return cosine_similarity([embed1], [embed2])[0][0]
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size  # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1  # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]

+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
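The new compute_and_remove_repeated_ngrams helper keeps only the first occurrence of any word n-gram (trigrams by default) that appears more than `threshold` times, trimming the verbatim repetition that sentence selection can otherwise produce. A quick usage sketch, assuming the installed package imports cleanly; the sample string is invented:

from compressor.semantic import compute_and_remove_repeated_ngrams

text = ("as discussed in the meeting we will ship on friday "
        "as discussed in the meeting the budget is approved "
        "as discussed in the meeting everyone agreed "
        "as discussed in the meeting no objections were raised")

# Trigrams such as "as discussed in" occur 4 times (> threshold of 3),
# so only their first occurrence survives.
print(compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3))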
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]

     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding = extract_embeddings(sentence)
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)

         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)

         # Get document-level embedding
-        doc_embedding = extract_embeddings(full_text)
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+            # Compute a weighted average of the two embeddings (60% document and 40% reference)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding

         # Calculate importance for each sentence
         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
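When reference_text is provided, the document embedding that every sentence is scored against becomes a 60/40 blend of the document and reference embeddings, nudging sentence selection toward the reference topic. The blend is plain vector arithmetic; the numbers below are illustrative stand-ins, not output of the bundled model:

import numpy as np

doc_embedding = np.array([0.2, 0.8, 0.1])             # stand-in for the document embedding
reference_text_embedding = np.array([0.9, 0.1, 0.4])  # stand-in for the reference embedding

steered = 0.6 * doc_embedding + 0.4 * reference_text_embedding
print(steered)  # [0.48 0.52 0.22]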
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         # Reorder sentences to maintain original flow
         compressed_text.sort(key=lambda x: sentences.index(x))

-        return ' '.join(compressed_text)
+        # Capitalize the first letter of each sentence
+        compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
     except Exception:
         traceback.print_exc()

     return full_text

-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
@@ -152,20 +205,80 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+            original_token_count = count_tokens(text)
+            target_token_count = int(original_token_count * compression_rate)
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+    return text

-    if target_token_count is None:
-        compression_rate = 1 - compression_rate
-        original_token_count = count_tokens(text)
-        target_token_count = int(original_token_count * compression_rate)
-    else:
-        original_token_count = count_tokens(text)
-        if original_token_count <= target_token_count:
-            return text
-        # Get the compression rate
-        compression_rate = target_token_count / original_token_count
-
-    return semantic_compress_text(text, compression_rate)
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 350,
+    semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+    return haystack
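Taken together, version 1.2 adds reference-steered compression and a coarse needle-in-haystack lookup. A hedged usage sketch of the new surface, assuming the package and its bundled ONNX/fastText resources are installed; the file name and sample strings are invented:

from compressor.semantic import compress_text, find_needle_in_haystack

article = open("article.txt").read()  # hypothetical input document

# Compress to roughly 30% of the original tokens, steering sentence
# selection toward a reference topic.
summary = compress_text(
    article,
    compression_rate=0.7,
    reference_text_steering="budget, revenue and quarterly results"
)

# Retrieve the ~350-token region most likely to contain a given fact,
# weighting literal character overlap (0.7) above semantic similarity (0.3).
region = find_needle_in_haystack(
    haystack=article,
    needle="What was the total revenue in 2023?",
)

print(summary)
print(region)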
semantic_compressor-1.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.1
+Version: 1.2
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
semantic_compressor-1.2.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=wXKmerx4j7cdYqjyHDJVkRX_k-2tbnBx5gNa6Qnpgtg,7148
+compressor/semantic.py,sha256=UUskzKy3Uj90UMDC_zRbPgCr30IxANCUuO1h0nWkKHU,12429
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.1.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
-semantic_compressor-1.1.dist-info/METADATA,sha256=sXv2tP4lrymD77feKsrt69B4cBbC-ktMh5IMxRe5XWQ,4545
-semantic_compressor-1.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-semantic_compressor-1.1.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
-semantic_compressor-1.1.dist-info/RECORD,,
+semantic_compressor-1.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.2.dist-info/METADATA,sha256=s9pltj6AtpXW6OEcZE1h3W8OPYks_PbhdCbJDR9e5b0,4545
+semantic_compressor-1.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+semantic_compressor-1.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.2.dist-info/RECORD,,