semantic_compressor-1.4-py3-none-any.whl → semantic_compressor-1.6-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
compressor/semantic.py CHANGED
@@ -13,10 +13,6 @@ from collections import Counter
13
13
  import onnxruntime as ort
14
14
  import nltk
15
15
 
16
- # Inicializando os stemmers
17
- stemmer_english = PorterStemmer()
18
- stemmer_portuguese = RSLPStemmer()
19
-
20
16
  tokenizer = RegexTokenizer()
21
17
  nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
22
18
 
@@ -24,6 +20,10 @@ os.environ['NLTK_DATA'] = nltk_data_path
24
20
 
25
21
  nltk.download('rslp')
26
22
 
23
+ # Inicializando os stemmers
24
+ stemmer_english = PorterStemmer()
25
+ stemmer_portuguese = RSLPStemmer()
26
+
27
27
  english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
28
28
  portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
29
29
  fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
@@ -250,11 +250,18 @@ def stem_text(text, lang='en'):
250
250
 
251
251
  return stemmed_text
252
252
 
253
- def correct_spelling(frase, detected_lang="pt"):
253
+ def correct_spelling(sentence, detected_lang="pt"):
254
254
  spell = SpellChecker(language=detected_lang)
255
- words = frase.split()
255
+ words = sentence.split()
256
256
  fixed = [spell.correction(word) for word in words]
257
- return " ".join(fixed)
257
+
258
+ final_words = []
259
+
260
+ # Merge corrections with the original words: spell.correction() returns None when it finds no candidate, so keep the original word in that case
261
+ for original, fixed_word in zip(words, fixed):
262
+ final_words.append(fixed_word if fixed_word is not None else original)
263
+
264
+ return " ".join(final_words)
258
265
 
259
266
  def find_needle_in_haystack(
260
267
  *, haystack: str, needle: str, block_size = 300,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.4
3
+ Version: 1.6
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,5 +1,5 @@
1
1
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
2
+ compressor/semantic.py,sha256=zIqmajE_X4tyLvvj5I5umdPqm_RB5D4sT5hRQG3qLZA,13985
3
3
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
4
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
5
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
8
8
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
9
9
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
10
10
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
11
- semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
12
- semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
13
- semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
14
- semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
15
- semantic_compressor-1.4.dist-info/RECORD,,
11
+ semantic_compressor-1.6.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
12
+ semantic_compressor-1.6.dist-info/METADATA,sha256=MAuc9bjclfAdJ-sLWNq1FYEiH0qtFvdGlvj_ja8gCW8,6178
13
+ semantic_compressor-1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
14
+ semantic_compressor-1.6.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
15
+ semantic_compressor-1.6.dist-info/RECORD,,