semantic-compressor 1.3.tar.gz → 1.5.tar.gz

Files changed (21)
  1. {semantic_compressor-1.3/semantic_compressor.egg-info → semantic_compressor-1.5}/PKG-INFO +2 -1
  2. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/semantic.py +46 -4
  3. {semantic_compressor-1.3 → semantic_compressor-1.5}/pyproject.toml +4 -3
  4. {semantic_compressor-1.3 → semantic_compressor-1.5/semantic_compressor.egg-info}/PKG-INFO +2 -1
  5. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/requires.txt +1 -0
  6. {semantic_compressor-1.3 → semantic_compressor-1.5}/setup.py +3 -2
  7. {semantic_compressor-1.3 → semantic_compressor-1.5}/LICENSE +0 -0
  8. {semantic_compressor-1.3 → semantic_compressor-1.5}/README.md +0 -0
  9. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/__init__.py +0 -0
  10. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/__init__.py +0 -0
  11. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/base.py +0 -0
  12. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/basic.py +0 -0
  13. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/minbpe/regex.py +0 -0
  14. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/embedding_model.onnx +0 -0
  15. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/en_stopwords.pkl +0 -0
  16. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/lid.176.ftz +0 -0
  17. {semantic_compressor-1.3 → semantic_compressor-1.5}/compressor/resources/pt_stopwords.pkl +0 -0
  18. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/SOURCES.txt +0 -0
  19. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  20. {semantic_compressor-1.3 → semantic_compressor-1.5}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.3 → semantic_compressor-1.5}/setup.cfg +0 -0
--- semantic_compressor-1.3/semantic_compressor.egg-info/PKG-INFO
+++ semantic_compressor-1.5/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.3
+ Version: 1.5
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
  Requires-Dist: fasttext
  Requires-Dist: onnxruntime
  Requires-Dist: onnxruntime-extensions
+ Requires-Dist: pyspellchecker
 
  ```python
  from compressor.semantic import compress_text, find_needle_in_haystack
--- semantic_compressor-1.3/compressor/semantic.py
+++ semantic_compressor-1.5/compressor/semantic.py
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
  from compressor.minbpe.regex import RegexTokenizer
  from nltk.tokenize import sent_tokenize
  from multiprocessing import cpu_count
+ from spellchecker import SpellChecker
+ from nltk.stem import PorterStemmer
+ from nltk.stem import RSLPStemmer
  from collections import Counter
  import onnxruntime as ort
+ import nltk
+
+ # Initialize the stemmers
+ stemmer_english = PorterStemmer()
+ stemmer_portuguese = RSLPStemmer()
 
  tokenizer = RegexTokenizer()
  nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
  os.environ['NLTK_DATA'] = nltk_data_path
 
+ nltk.download('rslp')
+
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
  fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
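The two stemmers wired in above are standard NLTK components; the RSLP rules for Portuguese ship as NLTK data, which is why the module downloads them at import time. A minimal sketch of the assumed behavior (example words are illustrative, not taken from the package):

```python
# Minimal sketch of the stemmers initialized above (assumed behavior, illustrative words).
import nltk
from nltk.stem import PorterStemmer, RSLPStemmer

nltk.download('rslp')  # RSLP stemming rules are NLTK data; required before RSLPStemmer runs

stemmer_english = PorterStemmer()      # rule-based suffix stripper for English
stemmer_portuguese = RSLPStemmer()     # RSLP suffix stripper for Portuguese

print(stemmer_english.stem("compressing"))     # -> "compress"
print(stemmer_portuguese.stem("comprimindo"))  # verb/gerund suffix reduced by the RSLP rules
```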
@@ -230,10 +240,35 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
      return text
 
+ def stem_text(text, lang='en'):
+     if lang == 'en':
+         stems = [stemmer_english.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+     else:
+         stems = [stemmer_portuguese.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+
+     return stemmed_text
+
+ def correct_spelling(sentence, detected_lang="pt"):
+     spell = SpellChecker(language=detected_lang)
+     words = sentence.split()
+     fixed = [spell.correction(word) for word in words]
+
+     final_words = []
+
+     # Merge corrected words back, falling back to the original word where "fixed" holds None (no correction found)
+     for original, fixed_word in zip(words, fixed):
+         final_words.append(fixed_word if fixed_word is not None else original)
+
+     return " ".join(final_words)
+
  def find_needle_in_haystack(
      *, haystack: str, needle: str, block_size = 300,
      semantic_embeddings_weight: float = 0.3,
-     textual_embeddings_weight: float = 0.7
+     textual_embeddings_weight: float = 0.7,
+     use_stemming: bool = False,
+     correct_spelling_needle: bool = False
  ):
      """
      Finds the string block in the haystack that contains the needle.
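The two helpers introduced in this hunk are plain module-level functions, so they can also be called on their own. A hedged sketch (the import path is inferred from the diff; the README only documents compress_text and find_needle_in_haystack):

```python
# Sketch only: stem_text and correct_spelling are assumed importable from
# compressor.semantic because the diff defines them at module level.
from compressor.semantic import stem_text, correct_spelling

print(stem_text("compressing large documents", lang="en"))
# -> space-joined Porter stems of each word

print(correct_spelling("semantc compressor", detected_lang="en"))
# per-word pyspellchecker correction; words with no suggestion are kept unchanged
```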
@@ -244,7 +279,9 @@ def find_needle_in_haystack(
          block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
          semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
          textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+         use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+         correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
      Returns:
          str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
      """
@@ -252,14 +289,19 @@ def find_needle_in_haystack(
      try:
          # Split the haystack into blocks
          blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+         lang = detect_language(f"{needle}\n\n{haystack}")
+
+         if correct_spelling_needle:
+             needle = correct_spelling(needle, lang)
 
          # Compute the embeddings of the needle
          needle_semantic_embedding = extract_semantic_embeddings(needle)
-         needle_textual_embedding = extract_textual_embeddings(needle.lower())
+         needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
          # Compute the embeddings of the haystack (each block)
          haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-         haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+         haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
          # Compute the similarity between the needle and each block
          semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
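Taken together, the new keyword arguments surface to callers like this; a sketch only, with made-up strings and flag values chosen purely for illustration:

```python
# Illustrative call of find_needle_in_haystack with the new 1.5 flags
# (the text and flag values below are examples, not recommendations).
from compressor.semantic import find_needle_in_haystack

haystack = (
    "Semantic compression shortens a text while preserving its meaning. "
    "The embedding model runs through onnxruntime. "
    "Language detection relies on the bundled fastText model."
)

best_block = find_needle_in_haystack(
    haystack=haystack,
    needle="which library runs the embeding model?",  # intentional misspelling
    block_size=300,
    use_stemming=True,              # stem the needle and each block before textual embeddings
    correct_spelling_needle=True,   # run pyspellchecker over the needle first
)
print(best_block)
```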
--- semantic_compressor-1.3/pyproject.toml
+++ semantic_compressor-1.5/pyproject.toml
@@ -1,10 +1,10 @@
  [build-system]
- requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions"]
+ requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
  build-backend = "setuptools.build_meta"
 
  [project]
  name = "semantic_compressor"
- version = "1.3"
+ version = "1.5"
  authors = [
    { name="Carlo Moro", email="cnmoro@gmail.com" },
  ]
@@ -22,5 +22,6 @@ dependencies = [
      "scikit-learn",
      "fasttext",
      "onnxruntime",
-     "onnxruntime-extensions"
+     "onnxruntime-extensions",
+     "pyspellchecker"
  ]
--- semantic_compressor-1.3/PKG-INFO
+++ semantic_compressor-1.5/semantic_compressor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.3
+ Version: 1.5
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
  Requires-Dist: fasttext
  Requires-Dist: onnxruntime
  Requires-Dist: onnxruntime-extensions
+ Requires-Dist: pyspellchecker
 
  ```python
  from compressor.semantic import compress_text, find_needle_in_haystack
--- semantic_compressor-1.3/semantic_compressor.egg-info/requires.txt
+++ semantic_compressor-1.5/semantic_compressor.egg-info/requires.txt
@@ -4,3 +4,4 @@ scikit-learn
  fasttext
  onnxruntime
  onnxruntime-extensions
+ pyspellchecker
--- semantic_compressor-1.3/setup.py
+++ semantic_compressor-1.5/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
  setup(
      name='semantic_compressor',
-     version='1.3',
+     version='1.5',
      author='Carlo Moro',
      author_email='cnmoro@gmail.com',
      description="Semantic text compression",
@@ -17,7 +17,8 @@ setup(
          "scikit-learn",
          "fasttext",
          "onnxruntime",
-         "onnxruntime-extensions"
+         "onnxruntime-extensions",
+         "pyspellchecker"
      ],
      classifiers=[
          'Programming Language :: Python :: 3',