semantic-compressor: semantic_compressor-1.6-py3-none-any.whl → semantic_compressor-1.7-py3-none-any.whl
- compressor/semantic.py +80 -18
- {semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/METADATA +1 -1
- {semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/RECORD +6 -6
- {semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -4,6 +4,7 @@ from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
+from concurrent.futures import ProcessPoolExecutor
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
 from spellchecker import SpellChecker
@@ -31,7 +32,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
 portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
 langdetect_model = fasttext.load_model(fasttext_model_path)
 
-embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT',
+embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
 
 _options = ort.SessionOptions()
 _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
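Worth noting when configuring this: `os.environ.get` returns a string whenever `EMBEDDING_MODEL_CPU_COUNT` is actually set, so the integer default `1` only survives when the variable is absent, while the `ort.SessionOptions` thread counts below expect an integer. A minimal defensive sketch (my suggestion, not part of the 1.7 code):

    import os

    # Hedged sketch: coerce explicitly, since os.environ.get returns a str
    # when EMBEDDING_MODEL_CPU_COUNT is set in the environment.
    embedding_model_cpu_count = int(os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1))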
@@ -263,8 +264,25 @@ def correct_spelling(sentence, detected_lang="pt"):
 
     return " ".join(final_words)
 
+def preprocess_and_extract_textual_embedding(block, use_stemming, lang):
+    """
+    Preprocesses a block (lowercasing and stemming if required) and extracts textual embeddings.
+
+    Args:
+        block (str): The text block to process.
+        use_stemming (bool): Whether to apply stemming.
+        lang (str): Language of the text for stemming.
+
+    Returns:
+        np.array: The textual embedding of the processed block.
+    """
+    processed_block = block.lower() if not use_stemming else stem_text(block.lower(), lang)
+    return extract_textual_embeddings(processed_block)
+
+
 def find_needle_in_haystack(
-    *, haystack: str, needle: str, block_size
+    *, haystack: str, needle: str, block_size=300,
+    embedding_mode: str = 'both',  # 'semantic', 'textual', or 'both'
     semantic_embeddings_weight: float = 0.3,
     textual_embeddings_weight: float = 0.7,
     use_stemming: bool = False,
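The new `preprocess_and_extract_textual_embedding` helper is defined at module level because `ProcessPoolExecutor` (used further down) pickles the callable it ships to worker processes, and lambdas or local closures cannot be pickled. A toy illustration of that constraint (hypothetical functions, not from the package):

    from concurrent.futures import ProcessPoolExecutor

    def shout(text):
        # Module-level functions can be pickled and sent to worker processes.
        return text.upper()

    if __name__ == '__main__':
        with ProcessPoolExecutor() as executor:
            print(list(executor.map(shout, ['a', 'b'])))  # ['A', 'B']
            # executor.map(lambda t: t.upper(), ['a', 'b'])  # would fail: lambdas can't be pickled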
@@ -277,16 +295,21 @@ def find_needle_in_haystack(
         haystack (str): The haystack string.
         needle (str): The needle string.
         block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        embedding_mode (str, optional): The embedding type to use: 'semantic', 'textual', or 'both'. Defaults to 'both'.
         semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
         textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
         use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
         correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
-
+
     Returns:
         str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
     """
 
     try:
+        # Validate embedding_mode
+        if embedding_mode not in {'semantic', 'textual', 'both'}:
+            raise ValueError("Invalid embedding_mode. Choose 'semantic', 'textual', or 'both'.")
+
         # Split the haystack into blocks
         blocks = structurize_text(haystack, tokens_per_chunk=block_size)
 
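A quick usage sketch of the new parameter (a hypothetical call; the function and defaults come from the diff above, the sample strings are mine). Note that the docstring still says `block_size` defaults to 350 while the new signature uses 300:

    from compressor.semantic import find_needle_in_haystack

    document = "..."  # hypothetical: the full text to search

    # Keyword-only call; embedding_mode is new in 1.7.
    best_region = find_needle_in_haystack(
        haystack=document,
        needle="payment terms",
        embedding_mode='both',
    )

    # With the default weights (0.3 semantic, 0.7 textual), a block scoring
    # 0.9 semantic and 0.6 textual ranks at 0.3*0.9 + 0.7*0.6 = 0.69.
    # Any other mode string fails fast with the ValueError shown above.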
@@ -295,33 +318,72 @@ def find_needle_in_haystack(
         if correct_spelling_needle:
             needle = correct_spelling(needle, lang)
 
-        # Compute the embeddings of the needle
-        needle_semantic_embedding =
-        needle_textual_embedding =
+        # Compute the embeddings of the needle based on the embedding mode
+        needle_semantic_embedding = None
+        needle_textual_embedding = None
 
+        if embedding_mode in {'semantic', 'both'}:
+            needle_semantic_embedding = extract_semantic_embeddings(needle)
+
+        if embedding_mode in {'textual', 'both'}:
+            needle_textual_embedding = extract_textual_embeddings(
+                needle.lower() if not use_stemming else stem_text(needle, lang)
+            )
+
         # Compute the embeddings of the haystack (each block)
-        haystack_semantic_embeddings = [
-        haystack_textual_embeddings = [
-
-        # Compute the similarity between the needle and each block
-        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
-        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
-
-        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
-        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+        haystack_semantic_embeddings = []
+        haystack_textual_embeddings = []
 
+        if embedding_mode in {'semantic', 'both'}:
+            with ProcessPoolExecutor() as executor:
+                haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
+
+        if embedding_mode in {'textual', 'both'}:
+            with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
+                haystack_textual_embeddings = list(
+                    executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
+                )
+
+        # Compute similarities based on the embedding mode
+        semantic_similarities = []
+        textual_similarities = []
+
+        if embedding_mode in {'semantic', 'both'}:
+            semantic_similarities = [
+                calculate_similarity(needle_semantic_embedding, block_embedding)
+                for block_embedding in haystack_semantic_embeddings
+            ]
+
+        if embedding_mode in {'textual', 'both'}:
+            textual_similarities = [
+                calculate_similarity(needle_textual_embedding, block_embedding)
+                for block_embedding in haystack_textual_embeddings
+            ]
+
+        # Calculate the overall similarity score
+        if embedding_mode == 'semantic':
+            sorted_blocks = sorted(zip(blocks, semantic_similarities), key=lambda x: x[1], reverse=True)
+        elif embedding_mode == 'textual':
+            sorted_blocks = sorted(zip(blocks, textual_similarities), key=lambda x: x[1], reverse=True)
+        else:  # both
+            sorted_blocks = sorted(
+                zip(blocks, semantic_similarities, textual_similarities),
+                key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight,
+                reverse=True
+            )
+
         # The most similar block is the one that contains the needle
         most_similar_block = sorted_blocks[0][0]
 
         # Find the index of the needle in all the blocks
         most_similar_block_index = blocks.index(most_similar_block)
 
-        start_index = most_similar_block_index-1 if most_similar_block_index > 0 else 0
+        start_index = most_similar_block_index - 1 if most_similar_block_index > 0 else 0
 
-        needle_region = blocks[start_index:most_similar_block_index+2]
+        needle_region = blocks[start_index:most_similar_block_index + 2]
 
         return ''.join(needle_region).strip()
     except Exception:
         traceback.print_exc()
 
-    return haystack
+    return haystack
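One caveat in the new parallel path: `cpu_count() // 1.5` floor-divides by a float and therefore yields a float (for example, `8 // 1.5 == 5.0`), and `ProcessPoolExecutor` cannot spawn workers from a non-integer `max_workers`, so this raises a TypeError at runtime. A minimal sketch of an int-safe variant (my suggestion, not what 1.7 ships):

    from concurrent.futures import ProcessPoolExecutor
    from multiprocessing import cpu_count

    def tokenize(text):
        # Toy stand-in for the real embedding work.
        return text.split()

    if __name__ == '__main__':
        # Suggested fix (not in the released code): coerce to int and keep
        # at least one worker; cpu_count() // 1.5 alone yields a float.
        workers = max(1, int(cpu_count() // 1.5))
        with ProcessPoolExecutor(max_workers=workers) as executor:
            print(list(executor.map(tokenize, ["a b", "c d"])))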
{semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=CJ5WhWKDuBT19qB_5EvMqWw5mtU2jCqBmOkVWXODLX0,16257
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.7.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.7.dist-info/METADATA,sha256=I4nO2VQxeIOJAAzs2DMhxmotVV6IvdVMfeheUwAFCTQ,6178
+semantic_compressor-1.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+semantic_compressor-1.7.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.7.dist-info/RECORD,,
{semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/LICENSE
File without changes
{semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/WHEEL
File without changes
{semantic_compressor-1.6.dist-info → semantic_compressor-1.7.dist-info}/top_level.txt
File without changes