semantic-compressor 1.3-py3-none-any.whl → 1.4-py3-none-any.whl

compressor/semantic.py CHANGED
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
  from compressor.minbpe.regex import RegexTokenizer
  from nltk.tokenize import sent_tokenize
  from multiprocessing import cpu_count
+ from spellchecker import SpellChecker
+ from nltk.stem import PorterStemmer
+ from nltk.stem import RSLPStemmer
  from collections import Counter
  import onnxruntime as ort
+ import nltk
+
+ # Initialize the stemmers
+ stemmer_english = PorterStemmer()
+ stemmer_portuguese = RSLPStemmer()
 
  tokenizer = RegexTokenizer()
  nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
  os.environ['NLTK_DATA'] = nltk_data_path
 
+ nltk.download('rslp')
+
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
  fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
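
This hunk wires two NLTK stemmers into the module: Porter for English and RSLP for Portuguese, with `nltk.download('rslp')` fetching the RSLP rule set at import time. A minimal sketch of what these stemmers do (the example words are illustrative, not from the package):

```python
import nltk
from nltk.stem import PorterStemmer, RSLPStemmer

nltk.download('rslp')  # the RSLP rules ship as a separate NLTK data package

print(PorterStemmer().stem("running"))  # -> "run"
print(RSLPStemmer().stem("casas"))      # -> "cas" (plural, then vowel reduction)
```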
@@ -230,10 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
      return text
 
+ def stem_text(text, lang='en'):
+     # Stem word by word with Porter (English) or RSLP (Portuguese)
+     if lang == 'en':
+         stems = [stemmer_english.stem(word) for word in text.split()]
+     else:
+         stems = [stemmer_portuguese.stem(word) for word in text.split()]
+     return " ".join(stems)
+
+ def correct_spelling(frase, detected_lang="pt"):
+     spell = SpellChecker(language=detected_lang)
+     words = frase.split()
+     # correction() returns None when no candidate exists, so keep the original word
+     fixed = [spell.correction(word) or word for word in words]
+     return " ".join(fixed)
+
  def find_needle_in_haystack(
      *, haystack: str, needle: str, block_size = 300,
      semantic_embeddings_weight: float = 0.3,
-     textual_embeddings_weight: float = 0.7
+     textual_embeddings_weight: float = 0.7,
+     use_stemming: bool = False,
+     correct_spelling_needle: bool = False
  ):
      """
      Finds the string block in the haystack that contains the needle.
@@ -244,7 +272,9 @@ def find_needle_in_haystack(
          block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 300.
          semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
          textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+         use_stemming (bool, optional): Whether to stem the needle and blocks before computing textual embeddings. Defaults to False.
+         correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle before searching. Defaults to False.
+
      Returns:
          str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
      """
@@ -252,14 +282,19 @@ def find_needle_in_haystack(
      try:
          # Split the haystack into blocks
          blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+         # Detect the shared language of the needle and haystack once
+         lang = detect_language(f"{needle}\n\n{haystack}")
+
+         if correct_spelling_needle:
+             needle = correct_spelling(needle, lang)
 
          # Compute the embeddings of the needle
          needle_semantic_embedding = extract_semantic_embeddings(needle)
-         needle_textual_embedding = extract_textual_embeddings(needle.lower())
+         needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
          # Compute the embeddings of the haystack (each block)
          haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-         haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+         haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
          # Compute the similarity between the needle and each block
          semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
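
Taken together, these hunks mean the 1.4 needle search can optionally stem both sides of the textual-embedding comparison and spell-correct the needle first. A usage sketch (the input strings and file name are invented for illustration):

```python
from compressor.semantic import find_needle_in_haystack

# Hypothetical input file holding the full text to search
haystack = open("relatorio.txt", encoding="utf-8").read()

# All arguments are keyword-only; the last two flags are new in 1.4.
block = find_needle_in_haystack(
    haystack=haystack,
    needle="Qual foi o faturamento no terceiro trimestre?",
    block_size=300,
    use_stemming=True,             # stem needle and blocks before textual embeddings
    correct_spelling_needle=True,  # run the needle through the spell checker first
)
print(block)
```

Note that the language fed to the stemmer and the spell checker is detected once from the needle and haystack combined, so mixed-language inputs are stemmed with a single language's rules.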
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.3
+ Version: 1.4
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
  Requires-Dist: fasttext
  Requires-Dist: onnxruntime
  Requires-Dist: onnxruntime-extensions
+ Requires-Dist: pyspellchecker
 
  ```python
  from compressor.semantic import compress_text, find_needle_in_haystack
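
The only metadata change besides the version bump is the new `pyspellchecker` dependency backing `correct_spelling`. One behavior worth knowing (and the reason for the `or word` fallback above): since pyspellchecker 0.7, `correction()` returns `None` when it has no candidate for a word. A small check, assuming the bundled Portuguese dictionary:

```python
from spellchecker import SpellChecker

spell = SpellChecker(language="pt")  # pyspellchecker ships a Portuguese word list
print(spell.correction("capitao"))   # likely "capitão" (closest dictionary word)
print(spell.correction("xqzwv"))     # no candidate -> None
```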
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- compressor/semantic.py,sha256=8MQdV-ZmTBMC3sEIQr565hafxS6v5_A_-dNiqb0R5Xg,12379
+ compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
- semantic_compressor-1.3.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
- semantic_compressor-1.3.dist-info/METADATA,sha256=baw_1lughU6R-9nQ_23COy7DP70ZI6H_DlH3YvrYBRU,6148
- semantic_compressor-1.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- semantic_compressor-1.3.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
- semantic_compressor-1.3.dist-info/RECORD,,
+ semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+ semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
+ semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+ semantic_compressor-1.4.dist-info/RECORD,,
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.1.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any