semantic-compressor 1.6__tar.gz → 1.7__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (21) hide show
  1. {semantic_compressor-1.6/semantic_compressor.egg-info → semantic_compressor-1.7}/PKG-INFO +1 -1
  2. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/semantic.py +80 -18
  3. {semantic_compressor-1.6 → semantic_compressor-1.7}/pyproject.toml +1 -1
  4. {semantic_compressor-1.6 → semantic_compressor-1.7/semantic_compressor.egg-info}/PKG-INFO +1 -1
  5. {semantic_compressor-1.6 → semantic_compressor-1.7}/setup.py +1 -1
  6. {semantic_compressor-1.6 → semantic_compressor-1.7}/LICENSE +0 -0
  7. {semantic_compressor-1.6 → semantic_compressor-1.7}/README.md +0 -0
  8. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/__init__.py +0 -0
  9. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/minbpe/__init__.py +0 -0
  10. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/minbpe/base.py +0 -0
  11. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/minbpe/basic.py +0 -0
  12. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/minbpe/regex.py +0 -0
  13. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/resources/embedding_model.onnx +0 -0
  14. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/resources/en_stopwords.pkl +0 -0
  15. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/resources/lid.176.ftz +0 -0
  16. {semantic_compressor-1.6 → semantic_compressor-1.7}/compressor/resources/pt_stopwords.pkl +0 -0
  17. {semantic_compressor-1.6 → semantic_compressor-1.7}/semantic_compressor.egg-info/SOURCES.txt +0 -0
  18. {semantic_compressor-1.6 → semantic_compressor-1.7}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  19. {semantic_compressor-1.6 → semantic_compressor-1.7}/semantic_compressor.egg-info/requires.txt +0 -0
  20. {semantic_compressor-1.6 → semantic_compressor-1.7}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.6 → semantic_compressor-1.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.6
3
+ Version: 1.7
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -4,6 +4,7 @@ from sklearn.decomposition import LatentDirichletAllocation
4
4
  from sklearn.metrics.pairwise import cosine_similarity
5
5
  from onnxruntime_extensions import get_library_path
6
6
  from compressor.minbpe.regex import RegexTokenizer
7
+ from concurrent.futures import ProcessPoolExecutor
7
8
  from nltk.tokenize import sent_tokenize
8
9
  from multiprocessing import cpu_count
9
10
  from spellchecker import SpellChecker
@@ -31,7 +32,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
31
32
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
32
33
  langdetect_model = fasttext.load_model(fasttext_model_path)
33
34
 
34
- embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', cpu_count() - 1)
35
+ embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
35
36
 
36
37
  _options = ort.SessionOptions()
37
38
  _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
@@ -263,8 +264,25 @@ def correct_spelling(sentence, detected_lang="pt"):
263
264
 
264
265
  return " ".join(final_words)
265
266
 
267
+ def preprocess_and_extract_textual_embedding(block, use_stemming, lang):
268
+ """
269
+ Preprocesses a block (lowercasing and stemming if required) and extracts textual embeddings.
270
+
271
+ Args:
272
+ block (str): The text block to process.
273
+ use_stemming (bool): Whether to apply stemming.
274
+ lang (str): Language of the text for stemming.
275
+
276
+ Returns:
277
+ np.array: The textual embedding of the processed block.
278
+ """
279
+ processed_block = block.lower() if not use_stemming else stem_text(block.lower(), lang)
280
+ return extract_textual_embeddings(processed_block)
281
+
282
+
266
283
  def find_needle_in_haystack(
267
- *, haystack: str, needle: str, block_size = 300,
284
+ *, haystack: str, needle: str, block_size=300,
285
+ embedding_mode: str = 'both', # 'semantic', 'textual', or 'both'
268
286
  semantic_embeddings_weight: float = 0.3,
269
287
  textual_embeddings_weight: float = 0.7,
270
288
  use_stemming: bool = False,
@@ -277,16 +295,21 @@ def find_needle_in_haystack(
277
295
  haystack (str): The haystack string.
278
296
  needle (str): The needle string.
279
297
  block_size (int, optional): The size, in tokens, of each block the haystack is split into. The needle is searched for in each block. Defaults to 300.
298
+ embedding_mode (str, optional): The embedding type to use: 'semantic', 'textual', or 'both'. Defaults to 'both'.
280
299
  semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
281
300
  textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
282
301
  use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
283
302
  correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
284
-
303
+
285
304
  Returns:
286
305
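  str: The region of the haystack most similar to the needle: the best-matching block joined with the block before it and the block after it, when they exist, so the result can span up to three blocks. If an error occurs, the full haystack is returned.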
287
306
  """
288
307
 
289
308
  try:
309
+ # Validate embedding_mode
310
+ if embedding_mode not in {'semantic', 'textual', 'both'}:
311
+ raise ValueError("Invalid embedding_mode. Choose 'semantic', 'textual', or 'both'.")
312
+
290
313
  # Split the haystack into blocks
291
314
  blocks = structurize_text(haystack, tokens_per_chunk=block_size)
292
315
 
@@ -295,33 +318,72 @@ def find_needle_in_haystack(
295
318
  if correct_spelling_needle:
296
319
  needle = correct_spelling(needle, lang)
297
320
 
298
- # Compute the embeddings of the needle
299
- needle_semantic_embedding = extract_semantic_embeddings(needle)
300
- needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
321
+ # Compute the embeddings of the needle based on the embedding mode
322
+ needle_semantic_embedding = None
323
+ needle_textual_embedding = None
301
324
 
325
+ if embedding_mode in {'semantic', 'both'}:
326
+ needle_semantic_embedding = extract_semantic_embeddings(needle)
327
+
328
+ if embedding_mode in {'textual', 'both'}:
329
+ needle_textual_embedding = extract_textual_embeddings(
330
+ needle.lower() if not use_stemming else stem_text(needle, lang)
331
+ )
332
+
302
333
  # Compute the embeddings of the haystack (each block)
303
- haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
304
- haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
305
-
306
- # Compute the similarity between the needle and each block
307
- semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
308
- textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
309
-
310
- # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
311
- sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
334
+ haystack_semantic_embeddings = []
335
+ haystack_textual_embeddings = []
312
336
 
337
+ if embedding_mode in {'semantic', 'both'}:
338
+ with ProcessPoolExecutor() as executor:
339
+ haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
340
+
341
+ if embedding_mode in {'textual', 'both'}:
342
+ with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
343
+ haystack_textual_embeddings = list(
344
+ executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
345
+ )
346
+
347
+ # Compute similarities based on the embedding mode
348
+ semantic_similarities = []
349
+ textual_similarities = []
350
+
351
+ if embedding_mode in {'semantic', 'both'}:
352
+ semantic_similarities = [
353
+ calculate_similarity(needle_semantic_embedding, block_embedding)
354
+ for block_embedding in haystack_semantic_embeddings
355
+ ]
356
+
357
+ if embedding_mode in {'textual', 'both'}:
358
+ textual_similarities = [
359
+ calculate_similarity(needle_textual_embedding, block_embedding)
360
+ for block_embedding in haystack_textual_embeddings
361
+ ]
362
+
363
+ # Calculate the overall similarity score
364
+ if embedding_mode == 'semantic':
365
+ sorted_blocks = sorted(zip(blocks, semantic_similarities), key=lambda x: x[1], reverse=True)
366
+ elif embedding_mode == 'textual':
367
+ sorted_blocks = sorted(zip(blocks, textual_similarities), key=lambda x: x[1], reverse=True)
368
+ else: # both
369
+ sorted_blocks = sorted(
370
+ zip(blocks, semantic_similarities, textual_similarities),
371
+ key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight,
372
+ reverse=True
373
+ )
374
+
313
375
  # The most similar block is the one that contains the needle
314
376
  most_similar_block = sorted_blocks[0][0]
315
377
 
316
378
  # Find the index of the needle in all the blocks
317
379
  most_similar_block_index = blocks.index(most_similar_block)
318
380
 
319
- start_index = most_similar_block_index-1 if most_similar_block_index > 0 else 0
381
+ start_index = most_similar_block_index - 1 if most_similar_block_index > 0 else 0
320
382
 
321
- needle_region = blocks[start_index:most_similar_block_index+2]
383
+ needle_region = blocks[start_index:most_similar_block_index + 2]
322
384
 
323
385
  return ''.join(needle_region).strip()
324
386
  except Exception:
325
387
  traceback.print_exc()
326
388
 
327
- return haystack
389
+ return haystack
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "semantic_compressor"
7
- version = "1.6"
7
+ version = "1.7"
8
8
  authors = [
9
9
  { name="Carlo Moro", email="cnmoro@gmail.com" },
10
10
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.6
3
+ Version: 1.7
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='semantic_compressor',
5
- version='1.6',
5
+ version='1.7',
6
6
  author='Carlo Moro',
7
7
  author_email='cnmoro@gmail.com',
8
8
  description="Semantic text compression",