semantic-compressor 1.3.tar.gz → 1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {semantic_compressor-1.3/semantic_compressor.egg-info → semantic_compressor-1.5}/PKG-INFO +2 -1
  2. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/semantic.py +46 -4
  3. {semantic_compressor-1.3 → semantic_compressor-1.5}/pyproject.toml +4 -3
  4. {semantic_compressor-1.3 → semantic_compressor-1.5/semantic_compressor.egg-info}/PKG-INFO +2 -1
  5. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/requires.txt +1 -0
  6. {semantic_compressor-1.3 → semantic_compressor-1.5}/setup.py +3 -2
  7. {semantic_compressor-1.3 → semantic_compressor-1.5}/LICENSE +0 -0
  8. {semantic_compressor-1.3 → semantic_compressor-1.5}/README.md +0 -0
  9. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/__init__.py +0 -0
  10. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/__init__.py +0 -0
  11. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/base.py +0 -0
  12. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/basic.py +0 -0
  13. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/regex.py +0 -0
  14. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/embedding_model.onnx +0 -0
  15. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/en_stopwords.pkl +0 -0
  16. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/lid.176.ftz +0 -0
  17. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/pt_stopwords.pkl +0 -0
  18. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/SOURCES.txt +0 -0
  19. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  20. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.3 → semantic_compressor-1.5}/setup.cfg +0 -0
{semantic_compressor-1.3/semantic_compressor.egg-info → semantic_compressor-1.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.5
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack
{semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/semantic.py

@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from spellchecker import SpellChecker
+from nltk.stem import PorterStemmer
+from nltk.stem import RSLPStemmer
 from collections import Counter
 import onnxruntime as ort
+import nltk
+
+# Initializing the stemmers
+stemmer_english = PorterStemmer()
+stemmer_portuguese = RSLPStemmer()
 
 tokenizer = RegexTokenizer()
 nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
 os.environ['NLTK_DATA'] = nltk_data_path
 
+nltk.download('rslp')
+
 english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
 portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
 fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
@@ -230,10 +240,35 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
     return text
 
+def stem_text(text, lang='en'):
+    if lang == 'en':
+        stems = [stemmer_english.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+    else:
+        stems = [stemmer_portuguese.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+
+    return stemmed_text
+
+def correct_spelling(sentence, detected_lang="pt"):
+    spell = SpellChecker(language=detected_lang)
+    words = sentence.split()
+    fixed = [spell.correction(word) for word in words]
+
+    final_words = []
+
+    # Interpolate original words with fixed words (each word could be "None" in "fixed" when no correction is needed)
+    for original, fixed_word in zip(words, fixed):
+        final_words.append(fixed_word if fixed_word is not None else original)
+
+    return " ".join(final_words)
+
 def find_needle_in_haystack(
     *, haystack: str, needle: str, block_size = 300,
     semantic_embeddings_weight: float = 0.3,
-    textual_embeddings_weight: float = 0.7
+    textual_embeddings_weight: float = 0.7,
+    use_stemming: bool = False,
+    correct_spelling_needle: bool = False
 ):
     """
     Finds the string block in the haystack that contains the needle.
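
A minimal sketch of how the two helpers introduced in this hunk could be called directly, assuming semantic_compressor 1.5 is installed and the RSLP data download at import time succeeds; the example strings are illustrative and not taken from the package:

```python
from compressor.semantic import stem_text, correct_spelling

# Portuguese stemming goes through NLTK's RSLPStemmer, English through PorterStemmer
print(stem_text("compressão semântica de textos", lang="pt"))

# pyspellchecker-based correction; words with no suggested fix are kept unchanged
print(correct_spelling("semantic compresion of documnts", detected_lang="en"))
```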
@@ -244,7 +279,9 @@ def find_needle_in_haystack(
         block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
         semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
         textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+        use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+        correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
     Returns:
         str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
     """
@@ -252,14 +289,19 @@ def find_needle_in_haystack(
     try:
         # Split the haystack into blocks
         blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        lang = detect_language(f"{needle}\n\n{haystack}")
+
+        if correct_spelling_needle:
+            needle = correct_spelling(needle, lang)
 
         # Compute the embeddings of the needle
         needle_semantic_embedding = extract_semantic_embeddings(needle)
-        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+        needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
         # Compute the embeddings of the haystack (each block)
         haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
         # Compute the similarity between the needle and each block
         semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
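
Taken together, these hunks mean a 1.5 call site can opt into stemming and needle spell-correction. A hedged usage sketch, using placeholder haystack/needle strings and only parameters that appear in the new signature:

```python
from compressor.semantic import find_needle_in_haystack

best_block = find_needle_in_haystack(
    haystack="...long reference document...",   # placeholder text
    needle="qual foi o resultado do experimento?",
    block_size=300,
    use_stemming=True,             # textual embeddings compare stemmed tokens
    correct_spelling_needle=True,  # run the new correct_spelling() on the needle first
)
print(best_block)
```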
{semantic_compressor-1.3 → semantic_compressor-1.5}/pyproject.toml

@@ -1,10 +1,10 @@
 [build-system]
-requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions"]
+requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "semantic_compressor"
-version = "1.3"
+version = "1.5"
 authors = [
   { name="Carlo Moro", email="cnmoro@gmail.com" },
 ]
@@ -22,5 +22,6 @@ dependencies = [
     "scikit-learn",
     "fasttext",
     "onnxruntime",
-    "onnxruntime-extensions"
+    "onnxruntime-extensions",
+    "pyspellchecker"
 ]
{semantic_compressor-1.3 → semantic_compressor-1.5/semantic_compressor.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.5
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack
{semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/requires.txt

@@ -4,3 +4,4 @@ scikit-learn
 fasttext
 onnxruntime
 onnxruntime-extensions
+pyspellchecker
{semantic_compressor-1.3 → semantic_compressor-1.5}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='semantic_compressor',
-    version='1.3',
+    version='1.5',
     author='Carlo Moro',
     author_email='cnmoro@gmail.com',
     description="Semantic text compression",
@@ -17,7 +17,8 @@ setup(
         "scikit-learn",
         "fasttext",
         "onnxruntime",
-        "onnxruntime-extensions"
+        "onnxruntime-extensions",
+        "pyspellchecker"
     ],
     classifiers=[
         'Programming Language :: Python :: 3',