semantic_compressor-1.1-py3-none-any.whl → semantic_compressor-1.2-py3-none-any.whl

compressor/semantic.py CHANGED
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort
 
 tokenizer = RegexTokenizer()
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )
 
-def extract_embeddings(text):
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
 
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
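The new extract_textual_embeddings path hashes character n-grams (lengths 1 through 6) into a fixed 512-dimensional vector, so any input string gets a comparable embedding without a fitted vocabulary; HashingVectorizer is stateless, which is why calling fit_transform per document is harmless. A minimal standalone sketch of that behavior (not part of the package; the example text is arbitrary):

    from sklearn.feature_extraction.text import HashingVectorizer

    hv = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
    # One document in, one 512-dimensional dense vector out, whatever the text length.
    vec = hv.transform(["the quick brown fox"]).toarray().sum(axis=0)
    print(vec.shape)  # (512,)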
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
 
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
-    def calculate_similarity(embed1, embed2):
-        return cosine_similarity([embed1], [embed2])[0][0]
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size  # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1  # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]
 
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
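The dedup helper counts overlapping word n-grams and only touches those seen more than the threshold (3 by default), keeping the first occurrence and deleting later ones. A standalone illustration of the counting step, with made-up text:

    from collections import Counter

    sample = "foo bar baz " * 5 + "end"   # 'foo bar baz' repeats five times
    words = sample.split()
    ngrams = [' '.join(words[i:i+3]) for i in range(len(words) - 3 + 1)]
    repeated = [g for g, c in Counter(ngrams).items() if c > 3]
    print(repeated)
    # ['foo bar baz', 'bar baz foo', 'baz foo bar'] -- all three overlapping windows
    # exceed the threshold, so their later occurrences would be removed.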
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]
 
     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding = extract_embeddings(sentence)
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
 
         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
 
         # Get document-level embedding
-        doc_embedding = extract_embeddings(full_text)
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+            # Compute a weighted average of the two embeddings (60% document and 40% reference)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
 
         # Calculate importance for each sentence
         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
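When reference_text is given, the document embedding used for sentence scoring becomes a 60/40 blend of the document and reference embeddings, so sentences closer to the reference are favored. A toy sketch with made-up 3-dimensional vectors:

    import numpy as np

    doc_embedding = np.array([1.0, 0.0, 0.0])             # hypothetical document embedding
    reference_text_embedding = np.array([0.0, 1.0, 0.0])  # hypothetical reference embedding
    doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
    print(doc_embedding)  # [0.6 0.4 0. ] -- sentences close to the reference now score higher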
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         # Reorder sentences to maintain original flow
         compressed_text.sort(key=lambda x: sentences.index(x))
 
-        return ' '.join(compressed_text)
+        # Capitalize the first letter of each sentence
+        compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
     except Exception:
         traceback.print_exc()
 
     return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
@@ -152,20 +205,80 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+            original_token_count = count_tokens(text)
+            target_token_count = int(original_token_count * compression_rate)
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+        return text
 
-    if target_token_count is None:
-        compression_rate = 1 - compression_rate
-        original_token_count = count_tokens(text)
-        target_token_count = int(original_token_count * compression_rate)
-    else:
-        original_token_count = count_tokens(text)
-        if original_token_count <= target_token_count:
-            return text
-        # Get the compression rate
-        compression_rate = target_token_count / original_token_count
-
-    return semantic_compress_text(text, compression_rate)
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 350,
+    semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+        return haystack
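The reworked compress_text normalizes its two calling conventions before delegating to semantic_compress_text: a compression_rate of 0.7 means roughly 70% of tokens are removed, while an explicit target_token_count is converted into an effective rate (and inputs already at or below the target are returned unchanged). Worked numbers, purely illustrative:

    original_token_count = 1000

    # rate-driven: compression_rate=0.7 keeps about 1 - 0.7 = 30% of the tokens
    target_token_count = int(original_token_count * (1 - 0.7))   # 300

    # target-driven: asking for 250 tokens yields an effective rate of 0.25
    derived_rate = 250 / original_token_count                    # 0.25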
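find_needle_in_haystack is keyword-only: it splits the haystack into roughly block_size-token chunks, scores each chunk with a weighted blend of semantic (0.3) and character-level (0.7) similarity to the needle, and returns the best-scoring chunk together with its neighbors. A hypothetical call; the file name and query below are placeholders, and the result depends on the bundled ONNX embedding model:

    from compressor.semantic import find_needle_in_haystack

    with open("long_document.txt") as f:          # placeholder input
        haystack = f.read()

    region = find_needle_in_haystack(
        haystack=haystack,
        needle="where is the secret key stored?",
        block_size=350,                           # tokens per block (default)
        semantic_embeddings_weight=0.3,
        textual_embeddings_weight=0.7,
    )
    print(region)  # roughly a three-block window centered on the best match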
semantic_compressor-1.1.dist-info/METADATA → semantic_compressor-1.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.1
+Version: 1.2
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
semantic_compressor-1.1.dist-info/RECORD → semantic_compressor-1.2.dist-info/RECORD RENAMED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=wXKmerx4j7cdYqjyHDJVkRX_k-2tbnBx5gNa6Qnpgtg,7148
+compressor/semantic.py,sha256=UUskzKy3Uj90UMDC_zRbPgCr30IxANCUuO1h0nWkKHU,12429
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.1.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
-semantic_compressor-1.1.dist-info/METADATA,sha256=sXv2tP4lrymD77feKsrt69B4cBbC-ktMh5IMxRe5XWQ,4545
-semantic_compressor-1.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-semantic_compressor-1.1.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
-semantic_compressor-1.1.dist-info/RECORD,,
+semantic_compressor-1.2.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.2.dist-info/METADATA,sha256=s9pltj6AtpXW6OEcZE1h3W8OPYks_PbhdCbJDR9e5b0,4545
+semantic_compressor-1.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+semantic_compressor-1.2.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.2.dist-info/RECORD,,