semantic-compressor 1.1-py3-none-any.whl → 1.3-py3-none-any.whl
- compressor/semantic.py +136 -22
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/METADATA +13 -2
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/RECORD +6 -6
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
```diff
@@ -1,11 +1,12 @@
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
 from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from collections import Counter
 import onnxruntime as ort
 
 tokenizer = RegexTokenizer()
```
```diff
@@ -33,7 +34,15 @@ embedding_model = ort.InferenceSession(
     providers=_providers
 )
 
-
+hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
+
+def extract_textual_embeddings(text):
+    X = hashing_vectorizer.fit_transform([text])
+    dense_matrix = X.toarray()
+    fixed_size_matrix = np.sum(dense_matrix, axis=0)
+    return fixed_size_matrix.tolist()
+
+def extract_semantic_embeddings(text):
     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
 
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
```
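The new `extract_textual_embeddings` turns a string into a fixed 512-dimensional vector of hashed character n-grams, a purely lexical counterpart to the ONNX semantic embeddings. A minimal standalone sketch of the same idea outside the package (the example strings and the `textual_embedding` name are illustrative, not part of the library):

```python
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stateless hashing: every call maps character n-grams into the same 512 buckets,
# so vectors produced from different texts are directly comparable.
vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)

def textual_embedding(text):
    # One row per input document; summing over rows keeps a flat 512-dim vector.
    return vectorizer.transform([text]).toarray().sum(axis=0)

a = textual_embedding("the azores archipelago")
b = textual_embedding("archipelago islands")
print(cosine_similarity([a], [b])[0][0])  # overlapping character n-grams yield a nonzero similarity
```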
```diff
@@ -59,10 +68,43 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
 
-def
-
-
+def compute_and_remove_repeated_ngrams(text, ngram_size=3, threshold=3):
+    words = text.split()
+
+    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words)-ngram_size+1)]
+
+    counter = Counter(ngrams)
+
+    repeated_ngrams = [ngram for ngram, count in counter.items() if count > threshold]
+
+    # Iterate through each repeated n-gram and remove the duplicates
+    for ngram in repeated_ngrams:
+        # Track if it's the first occurrence
+        first_occurrence = True
+        i = 0
+
+        while i <= len(words) - ngram_size:
+            # Form a sliding window n-gram from the current position
+            current_ngram = ' '.join(words[i:i+ngram_size])
+
+            if current_ngram == ngram:
+                if first_occurrence:
+                    # Mark the first occurrence and skip
+                    first_occurrence = False
+                    i += ngram_size  # Move ahead by the size of the n-gram
+                else:
+                    # Remove the n-gram by removing the words that make up this n-gram
+                    del words[i:i+ngram_size]
+            else:
+                i += 1  # Move forward
+
+    # Rejoin the words back into a single string
+    return ' '.join(words)
+
+def calculate_similarity(embed1, embed2):
+    return cosine_similarity([embed1], [embed2])[0][0]
 
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5, reference_text: str = None):
     def create_lda_model(texts, stopwords):
         vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
```
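`compute_and_remove_repeated_ngrams` is the new post-processing step that strips n-grams the compressor repeats too often, keeping only their first occurrence. A usage sketch, assuming semantic_compressor 1.3 is installed (importing the module also loads its ONNX and fastText resources, so the first import is slow):

```python
from compressor.semantic import compute_and_remove_repeated_ngrams

# Build an artificially repetitive string: the same six-word phrase five times.
noisy = ("the islands produce cheese and wine " * 5) + "and the story ends here"

# With ngram_size=3 and threshold=3, any 3-gram seen more than three times
# keeps its first occurrence and loses the later ones.
print(compute_and_remove_repeated_ngrams(noisy, ngram_size=3, threshold=3))
```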
```diff
@@ -75,7 +117,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return lda.transform(vec)[0]
 
     def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding =
+        sentence_embedding = extract_semantic_embeddings(sentence)
         semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
 
         topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
```
```diff
@@ -106,7 +148,13 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
 
         # Get document-level embedding
-        doc_embedding =
+        doc_embedding = extract_semantic_embeddings(full_text)
+
+        if reference_text is not None:
+            reference_text_embedding = extract_semantic_embeddings(reference_text)
+
+            # Compute an weighted average of the two embeddings (60% document and 40% reference)
+            doc_embedding = 0.6 * doc_embedding + 0.4 * reference_text_embedding
 
         # Calculate importance for each sentence
         sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
```
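With `reference_text` set, the document-level embedding that every sentence is scored against becomes a 60/40 blend of the document and the reference, which biases sentence selection toward the reference topic. A toy sketch with two-dimensional stand-in vectors (the real embeddings come from the ONNX model and have many more dimensions):

```python
import numpy as np

doc_embedding = np.array([1.0, 0.0])        # stand-in for the document embedding
reference_embedding = np.array([0.0, 1.0])  # stand-in for the reference embedding

# Same blend as in the diff: 60% document, 40% reference.
steered = 0.6 * doc_embedding + 0.4 * reference_embedding
print(steered)  # [0.6 0.4] -> sentences are now partly judged against the reference
```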
```diff
@@ -137,13 +185,18 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         # Reorder sentences to maintain original flow
         compressed_text.sort(key=lambda x: sentences.index(x))
 
-
+        # Capitalize the first letter of each sentence
+        compressed_text = [sentence.capitalize() for sentence in compressed_text]
+
+        cleaned_compressed_text = ' '.join(compressed_text).replace('  ', ' ').strip()
+        cleaned_compressed_text = compute_and_remove_repeated_ngrams(cleaned_compressed_text)
+        return cleaned_compressed_text
     except Exception:
         traceback.print_exc()
 
     return full_text
 
-def compress_text(text, *, target_token_count=None, compression_rate=0.7):
+def compress_text(text, *, target_token_count=None, compression_rate=0.7, reference_text_steering=None):
     """
     Compress text using either a compression rate or a target token count.
     If both are provided, the compression rate will be used.
```
```diff
@@ -152,20 +205,81 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7):
         text (str): The text to be compressed.
         target_token_count (int, optional): The target token count for compression. Defaults to None.
         compression_rate (float, optional): The compression rate as a percentage. Defaults to 0.7. Example: 0.7 means 70% reduction.
-
+        reference_text_steering (str, optional): The reference text to steer the compression. Defaults to None.
+
     Returns:
         str: The compressed text.
     """
+    try:
+        if target_token_count is None:
+            compression_rate = 1 - compression_rate
+        else:
+            original_token_count = count_tokens(text)
+            if original_token_count <= target_token_count:
+                return text
+            # Get the compression rate
+            compression_rate = target_token_count / original_token_count
+
+        return semantic_compress_text(
+            full_text = text,
+            compression_rate = compression_rate,
+            reference_text = reference_text_steering
+        )
+    except Exception:
+        traceback.print_exc()
+
+    return text
+
+def find_needle_in_haystack(
+    *, haystack: str, needle: str, block_size = 300,
+    semantic_embeddings_weight: float = 0.3,
+    textual_embeddings_weight: float = 0.7
+):
+    """
+    Finds the string block in the haystack that contains the needle.
+
+    Args:
+        haystack (str): The haystack string.
+        needle (str): The needle string.
+        block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
+        semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
+        textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
+
+    Returns:
+        str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
+    """
+
+    try:
+        # Split the haystack into blocks
+        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        # Compute the embeddings of the needle
+        needle_semantic_embedding = extract_semantic_embeddings(needle)
+        needle_textual_embedding = extract_textual_embeddings(needle.lower())
 
-
-
-
-
-
-
-
-
-#
-
-
-
+        # Compute the embeddings of the haystack (each block)
+        haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+
+        # Compute the similarity between the needle and each block
+        semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
+        textual_similarities = [calculate_similarity(needle_textual_embedding, block_embedding) for block_embedding in haystack_textual_embeddings]
+
+        # Sort the blocks by similarity, using the weighted average of semantic and textual similarity
+        sorted_blocks = sorted(zip(blocks, semantic_similarities, textual_similarities), key=lambda x: x[1] * semantic_embeddings_weight + x[2] * textual_embeddings_weight, reverse=True)
+
+        # The most similar block is the one that contains the needle
+        most_similar_block = sorted_blocks[0][0]
+
+        # Find the index of the needle in all the blocks
+        most_similar_block_index = blocks.index(most_similar_block)
+
+        start_index = most_similar_block_index-1 if most_similar_block_index > 0 else 0
+
+        needle_region = blocks[start_index:most_similar_block_index+2]
+
+        return ''.join(needle_region).strip()
+    except Exception:
+        traceback.print_exc()
+
+    return haystack
```
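Both knobs of `compress_text` collapse into the single `compression_rate` value handed to `semantic_compress_text`, which is the fraction of sentences to keep: the user-facing rate is a "percent reduction" and gets inverted, while a token budget becomes the ratio of target to original tokens. A small worked example with made-up counts (the real path calls `count_tokens` on the input):

```python
# User asks for a 70% reduction -> keep roughly 30% of the sentences.
compression_rate = 0.7
keep_fraction = 1 - compression_rate

# User asks for a 100-token result from a hypothetical 1000-token input
# -> keep roughly 10% of the sentences.
original_token_count = 1000
target_token_count = 100
keep_fraction_from_budget = target_token_count / original_token_count

print(round(keep_fraction, 2), keep_fraction_from_budget)  # 0.3 0.1
```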
{semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.1
+Version: 1.3
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
```
````diff
@@ -17,7 +17,7 @@ Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
 
 ```python
-from compressor.semantic import compress_text
+from compressor.semantic import compress_text, find_needle_in_haystack
 
 text = """
 Akin to France's heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago. The Azores, an archipelago of nine islands belonging to Portugal and located roughly between Europe and the US, are cow country. They're said to be home to more cattle than people, and despite being home to less than 3% of Portugal's population, the islands produce 30% of Portugal's dairy products and 13% of its beef. Beef is part of everyday life in the Azores, and come spring on one particular island, the ingredient even crosses paths with religion. In the days following Easter, Azorean people kick off a series of religious celebrations called Festas do Espírito Santo (Festivals of the Holy Spirit). During the 13th Century, a Catholic sect called the Cult of the Holy Spirit predicted a utopian era on Earth. This fringe faith was discouraged in mainland Europe but lived on in these remote islands in the middle of the Atlantic Ocean. The sect was also promoted by Portuguese queen Elizabeth of Aragon (also known as Elizabeth of Portugal), who was known for her charity. Over the subsequent centuries, a series of festivals emerged on the Azores that blended these utopian aspirations with the queen's alleged generosity. Between Easter and the week following Whitsunday, a total of eight weeks, the islands host a series of parades and other cultural and religious festivals that revolve around brightly coloured community houses called impérios. During this time, the community houses also collect donations from locals, which is then redistributed to people in the form of bread, beef and wine. These three elements generally come together in the form of a soup, called sopa do Espírito Santo, that's served at the impérios during the festivals. But on the island of Terceira, locals combine these ingredients in a different and delicious way, one that's become synonymous with the island's culinary identity. Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush)Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush) "People eat alcatra year round, but especially during the celebrations in spring and summer," explains Duarte Fournier. He is the Grand Master of the Brotherhood of Alcatra, a culinary fraternity on Terceira, and is telling me about the island's signature dish: cuts of beef braised in local wine, smoked pork fat and dried spices, resulting in something of a heartier, spicier, richer version of France's famed boeuf bourguignon. We're sitting at a cafe in Angra do Heroísmo, Terceira's largest city, and as we chat, children race to and from a nearby império delivering massive trays of raw beef to neighbours. Fournier tells me that alcatra likely has its origins in northern Portugal, where there's a tradition of baking goat in wine. "We don't know why it's called alcatra," he says. "We suppose it's from Arabic. Al catar means 'small pieces of meat'." According to Fournier, alcatra differs from mainland Portugal's baked meat dishes in that it includes dried spices, generally allspice and black peppercorns, but also sometimes clove or cinnamon.
````
````diff
@@ -30,4 +30,15 @@ print(compressed_text_90_percent)
 compressed_text_to_100_tokens = compress_text(text, target_token_count=100)
 print(compressed_text_to_100_tokens)
 # 'Akin to France\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago.'
+
+text_reference = "Archipelago Islands"
+compressed_text_with_steering = compress_text(text, compression_rate=0.7, reference_text_steering=text_reference)
+# 'Akin to france\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote azores archipelago. The azores, an archipelago of nine islands belonging to portugal and located roughly between europe and the us, are cow country. Beef is part of everyday life in the azores, and come spring on one particular island, the ingredient even crosses paths with religion. But on the island of terceira, locals combine these ingredients in a different and delicious way, one that\'s become synonymous with the island\'s culinary identity. He is the grand master of the brotherhood of alcatra, a culinary fraternity on terceira, and is telling me about the island\'s signature dish: cuts of beef braised in local wine, smoked pork fat and dried spices, resulting in something of a heartier, spicier, richer version of france\'s famed boeuf bourguignon.'
+
+needle_in_haystack = find_needle_in_haystack(
+    haystack = text,
+    needle = "Archipelago Islands",
+    block_size = 200
+)
+# 'Akin to France\'s heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago. The Azores, an archipelago of nine islands belonging to Portugal and located roughly between Europe and the US, are cow country. They\'re said to be home to more cattle than people, and despite being home to less than 3% of Portugal\'s population, the islands'
 ```
````
{semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/RECORD
CHANGED
```diff
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=8MQdV-ZmTBMC3sEIQr565hafxS6v5_A_-dNiqb0R5Xg,12379
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.3.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.3.dist-info/METADATA,sha256=baw_1lughU6R-9nQ_23COy7DP70ZI6H_DlH3YvrYBRU,6148
+semantic_compressor-1.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+semantic_compressor-1.3.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.3.dist-info/RECORD,,
```
{semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/LICENSE
File without changes

{semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/WHEEL
File without changes

{semantic_compressor-1.1.dist-info → semantic_compressor-1.3.dist-info}/top_level.txt
File without changes