semantic-compressor 1.3-py3-none-any.whl → 1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
compressor/semantic.py CHANGED
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
  from compressor.minbpe.regex import RegexTokenizer
  from nltk.tokenize import sent_tokenize
  from multiprocessing import cpu_count
+ from spellchecker import SpellChecker
+ from nltk.stem import PorterStemmer
+ from nltk.stem import RSLPStemmer
  from collections import Counter
  import onnxruntime as ort
+ import nltk
+
+ # Initializing the stemmers
+ stemmer_english = PorterStemmer()
+ stemmer_portuguese = RSLPStemmer()

  tokenizer = RegexTokenizer()
  nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))

  os.environ['NLTK_DATA'] = nltk_data_path

+ nltk.download('rslp')
+
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
  fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
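Because `nltk.download('rslp')` now runs at import time, the RSLP stemmer data must be reachable the first time `compressor.semantic` is imported. A minimal, hedged sketch of pre-fetching it into the bundled `nltk_data` directory (the `download_dir` choice is an assumption, not something this version does itself):

```python
# Hedged sketch (not part of the package): pre-fetch the RSLP stemmer data that
# semantic.py now downloads at import time, so offline environments do not fail
# on `import compressor.semantic`. The download_dir choice is an assumption.
import importlib.resources
import nltk

nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
nltk.download('rslp', download_dir=nltk_data_path, quiet=True)
```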
@@ -230,10 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere

      return text

+ def stem_text(text, lang='en'):
+     if lang == 'en':
+         stems = [stemmer_english.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+     else:
+         stems = [stemmer_portuguese.stem(word) for word in text.split()]
+         stemmed_text = " ".join(stems)
+
+     return stemmed_text
+
+ def correct_spelling(frase, detected_lang="pt"):
+     spell = SpellChecker(language=detected_lang)
+     words = frase.split()
+     fixed = [spell.correction(word) for word in words]
+     return " ".join(fixed)
+
  def find_needle_in_haystack(
      *, haystack: str, needle: str, block_size = 300,
      semantic_embeddings_weight: float = 0.3,
-     textual_embeddings_weight: float = 0.7
+     textual_embeddings_weight: float = 0.7,
+     use_stemming: bool = False,
+     correct_spelling_needle: bool = False
  ):
      """
      Finds the string block in the haystack that contains the needle.
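For orientation, a hedged usage sketch of the two helpers added in 1.4 (example strings are illustrative and not taken from the package):

```python
# Hedged usage sketch (not shipped with the package); example strings are illustrative.
from compressor.semantic import stem_text, correct_spelling

# Reduce words to their stems before textual embedding (PorterStemmer for 'en',
# RSLPStemmer for anything else, i.e. Portuguese).
english_stems = stem_text("compressing long documents semantically", lang='en')
portuguese_stems = stem_text("comprimindo documentos longos", lang='pt')

# Fix a misspelled needle before searching; pyspellchecker picks the most likely word.
# Note: SpellChecker.correction() can return None for out-of-vocabulary tokens.
fixed_needle = correct_spelling("semantc compresion", detected_lang="en")

print(english_stems, portuguese_stems, fixed_needle, sep="\n")
```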
@@ -244,7 +272,9 @@ def find_needle_in_haystack(
          block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
          semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
          textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+         use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+         correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
      Returns:
          str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
      """
@@ -252,14 +282,19 @@ def find_needle_in_haystack(
      try:
          # Split the haystack into blocks
          blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+         lang = detect_language(f"{needle}\n\n{haystack}")
+
+         if correct_spelling_needle:
+             needle = correct_spelling(needle, lang)

          # Compute the embeddings of the needle
          needle_semantic_embedding = extract_semantic_embeddings(needle)
-         needle_textual_embedding = extract_textual_embeddings(needle.lower())
+         needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))

          # Compute the embeddings of the haystack (each block)
          haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-         haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+         haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]

          # Compute the similarity between the needle and each block
          semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
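The hunk cuts off before the scores are combined. As a rough, assumed sketch of how the two weights are typically applied (not the package's verbatim code; `textual_similarities` is inferred from the pattern above):

```python
# Assumed combination step, for illustration only (not taken from the diff):
# blend each block's semantic and textual similarity with the two weights and
# keep the block whose blended score is highest.
combined_scores = [
    semantic_embeddings_weight * sem + textual_embeddings_weight * txt
    for sem, txt in zip(semantic_similarities, textual_similarities)
]
best_block = blocks[combined_scores.index(max(combined_scores))]
```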
semantic_compressor-1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.3
+ Version: 1.4
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
  Requires-Dist: fasttext
  Requires-Dist: onnxruntime
  Requires-Dist: onnxruntime-extensions
+ Requires-Dist: pyspellchecker

  ```python
  from compressor.semantic import compress_text, find_needle_in_haystack
semantic_compressor-1.4.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- compressor/semantic.py,sha256=8MQdV-ZmTBMC3sEIQr565hafxS6v5_A_-dNiqb0R5Xg,12379
+ compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
- semantic_compressor-1.3.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
- semantic_compressor-1.3.dist-info/METADATA,sha256=baw_1lughU6R-9nQ_23COy7DP70ZI6H_DlH3YvrYBRU,6148
- semantic_compressor-1.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- semantic_compressor-1.3.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
- semantic_compressor-1.3.dist-info/RECORD,,
+ semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+ semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
+ semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+ semantic_compressor-1.4.dist-info/RECORD,,
semantic_compressor-1.4.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.1.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
