semantic-compressor 1.1.tar.gz → 1.2.tar.gz

Files changed (21)
  1. {semantic_compressor-1.1 → semantic_compressor-1.2}/PKG-INFO +1 -1
  2. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/semantic.py +135 -22
  3. {semantic_compressor-1.1 → semantic_compressor-1.2}/pyproject.toml +1 -1
  4. {semantic_compressor-1.1 → semantic_compressor-1.2}/semantic_compressor.egg-info/PKG-INFO +1 -1
  5. {semantic_compressor-1.1 → semantic_compressor-1.2}/setup.py +1 -1
  6. {semantic_compressor-1.1 → semantic_compressor-1.2}/LICENSE +0 -0
  7. {semantic_compressor-1.1 → semantic_compressor-1.2}/README.md +0 -0
  8. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/__init__.py +0 -0
  9. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/minbpe/__init__.py +0 -0
  10. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/minbpe/base.py +0 -0
  11. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/minbpe/basic.py +0 -0
  12. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/minbpe/regex.py +0 -0
  13. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/resources/embedding_model.onnx +0 -0
  14. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/resources/en_stopwords.pkl +0 -0
  15. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/resources/lid.176.ftz +0 -0
  16. {semantic_compressor-1.1 → semantic_compressor-1.2}/compressor/resources/pt_stopwords.pkl +0 -0
  17. {semantic_compressor-1.1 → semantic_compressor-1.2}/semantic_compressor.egg-info/SOURCES.txt +0 -0
  18. {semantic_compressor-1.1 → semantic_compressor-1.2}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  19. {semantic_compressor-1.1 → semantic_compressor-1.2}/semantic_compressor.egg-info/requires.txt +0 -0
  20. {semantic_compressor-1.1 → semantic_compressor-1.2}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.1 → semantic_compressor-1.2}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.1
+Version: 1.2
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
compressor/semantic.py
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort
 
 tokenizer = RegexTokenizer()
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )
 
-def extract_embeddings(text):
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
 
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
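A minimal usage sketch of the two embedding helpers above (the sample sentence is invented; it assumes the package's compressor.semantic module is importable). extract_semantic_embeddings runs the bundled ONNX model, while extract_textual_embeddings hashes character n-grams (1 to 6 chars) into a fixed 512-dimensional vector; HashingVectorizer is stateless, so calling fit_transform on a single text is safe:

    from compressor.semantic import extract_semantic_embeddings, extract_textual_embeddings

    text = "The quick brown fox jumps over the lazy dog."

    sem_vec = extract_semantic_embeddings(text)         # dense vector from the ONNX model
    txt_vec = extract_textual_embeddings(text.lower())  # hashed char n-gram features

    print(len(txt_vec))  # 512, fixed by n_features in the HashingVectorizer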
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
 
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
-    def calculate_similarity(embed1, embed2):
-        return cosine_similarity([embed1], [embed2])[0][0]
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size  # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1  # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]
 
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
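A quick sketch of how the new compute_and_remove_repeated_ngrams helper behaves (the sample string is invented): word windows of ngram_size that repeat more than threshold times are collapsed toward their first occurrence, while unique text is left alone.

    from compressor.semantic import compute_and_remove_repeated_ngrams

    noisy = ("please unsubscribe below. " * 5) + "the actual content of the message."
    cleaned = compute_and_remove_repeated_ngrams(noisy, ngram_size=3, threshold=3)

    # The repeated "please unsubscribe below." boilerplate collapses to a single
    # occurrence; the unique trailing sentence is untouched.
    print(cleaned)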
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]
 
     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding = extract_embeddings(sentence)
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
 
         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
        lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
 
        # Get document-level embedding
-       doc_embedding = extract_embeddings(full_text)
+       doc_embedding = extract_semantic_embeddings(full_text)
+
+       if reference_text is not None:
+           reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+           # Compute a weighted average of the two embeddings (60% document and 40% reference)
+           doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
 
        # Calculate importance for each sentence
        sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
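The reference-text steering added here is simply a weighted blend of two embedding vectors before sentences are ranked. Conceptually (sample strings invented, weights taken from the diff):

    from compressor.semantic import extract_semantic_embeddings

    doc_vec = extract_semantic_embeddings("the full document text")
    ref_vec = extract_semantic_embeddings("security and compliance requirements")

    # 60% document, 40% reference: the document still dominates, but sentence
    # ranking is nudged toward the reference topic.
    steered_embedding = 0.6 * doc_vec + 0.4 * ref_vec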
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
        # Reorder sentences to maintain original flow
        compressed_text.sort(key=lambda x: sentences.index(x))
 
-       return ' '.join(compressed_text)
+       # Capitalize the first letter of each sentence
+       compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+       cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+       cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+       return cleaned_compressed_text
    except Exception:
        traceback.print_exc()
 
    return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
@@ -152,20 +205,80 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+            original_token_count = count_tokens(text)
+            target_token_count = int(original_token_count * compression_rate)
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+    return text
 
-    if target_token_count is None:
-        compression_rate = 1 - compression_rate
-        original_token_count = count_tokens(text)
-        target_token_count = int(original_token_count * compression_rate)
-    else:
-        original_token_count = count_tokens(text)
-        if original_token_count <= target_token_count:
-            return text
-        # Get the compression rate
-        compression_rate = target_token_count / original_token_count
-
-    return semantic_compress_text(text, compression_rate)
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 350,
+    semantic_embeddings_weight: float = 0.3, textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        needle_region = blocks[most_similar_block_index-1:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+    return haystack
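The two public entry points touched by this release can be exercised roughly as follows (a sketch only: the file name and query strings are invented; the module path compressor.semantic comes from this package's layout):

    from compressor.semantic import compress_text, find_needle_in_haystack

    long_text = open("report.txt", encoding="utf-8").read()  # hypothetical input file

    # compression_rate=0.7 means roughly a 70% token reduction; the optional
    # reference text steers which sentences survive.
    summary = compress_text(
        long_text,
        compression_rate=0.7,
        reference_text_steering="pricing and licensing terms"
    )

    # Alternatively, aim for an absolute token budget.
    summary_500 = compress_text(long_text, target_token_count=500)

    # Return the ~350-token block (plus its neighbours) most similar to the question,
    # mixing semantic (30%) and char-n-gram (70%) similarity by default.
    answer_region = find_needle_in_haystack(
        haystack=long_text,
        needle="What license does the project use?"
    )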
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "semantic_compressor"
-version = "1.1"
+version = "1.2"
 authors = [
     { name="Carlo Moro", email="cnmoro@gmail.com" },
 ]
semantic_compressor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.1
+Version: 1.2
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='semantic_compressor',
-    version='1.1',
+    version='1.2',
     author='Carlo Moro',
     author_email='cnmoro@gmail.com',
     description="Semantic text compression",