semantic-compressor 1.4__py3-none-any.whl → 1.6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- compressor/semantic.py +14 -7
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/METADATA +1 -1
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/RECORD +6 -6
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -13,10 +13,6 @@ from collections import Counter
|
|
13
13
|
import onnxruntime as ort
|
14
14
|
import nltk
|
15
15
|
|
16
|
-
# Inicializando os stemmers
|
17
|
-
stemmer_english = PorterStemmer()
|
18
|
-
stemmer_portuguese = RSLPStemmer()
|
19
|
-
|
20
16
|
tokenizer = RegexTokenizer()
|
21
17
|
nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
|
22
18
|
|
@@ -24,6 +20,10 @@ os.environ['NLTK_DATA'] = nltk_data_path
|
|
24
20
|
|
25
21
|
nltk.download('rslp')
|
26
22
|
|
23
|
+
# Inicializando os stemmers
|
24
|
+
stemmer_english = PorterStemmer()
|
25
|
+
stemmer_portuguese = RSLPStemmer()
|
26
|
+
|
27
27
|
english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
|
28
28
|
portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
|
29
29
|
fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
|
@@ -250,11 +250,18 @@ def stem_text(text, lang='en'):
|
|
250
250
|
|
251
251
|
return stemmed_text
|
252
252
|
|
253
|
-
def correct_spelling(
|
253
|
+
def correct_spelling(sentence, detected_lang="pt"):
|
254
254
|
spell = SpellChecker(language=detected_lang)
|
255
|
-
words =
|
255
|
+
words = sentence.split()
|
256
256
|
fixed = [spell.correction(word) for word in words]
|
257
|
-
|
257
|
+
|
258
|
+
final_words = []
|
259
|
+
|
260
|
+
# Interpolate original words with fixed words (each word could be "None" in "fixed" when no correction is needed)
|
261
|
+
for original, fixed_word in zip(words, fixed):
|
262
|
+
final_words.append(fixed_word if fixed_word is not None else original)
|
263
|
+
|
264
|
+
return " ".join(final_words)
|
258
265
|
|
259
266
|
def find_needle_in_haystack(
|
260
267
|
*, haystack: str, needle: str, block_size = 300,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
compressor/semantic.py,sha256=
|
2
|
+
compressor/semantic.py,sha256=zIqmajE_X4tyLvvj5I5umdPqm_RB5D4sT5hRQG3qLZA,13985
|
3
3
|
compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
|
4
4
|
compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
|
5
5
|
compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
|
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
|
|
8
8
|
compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
|
9
9
|
compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
10
10
|
compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
|
11
|
-
semantic_compressor-1.
|
12
|
-
semantic_compressor-1.
|
13
|
-
semantic_compressor-1.
|
14
|
-
semantic_compressor-1.
|
15
|
-
semantic_compressor-1.
|
11
|
+
semantic_compressor-1.6.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
|
12
|
+
semantic_compressor-1.6.dist-info/METADATA,sha256=MAuc9bjclfAdJ-sLWNq1FYEiH0qtFvdGlvj_ja8gCW8,6178
|
13
|
+
semantic_compressor-1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
14
|
+
semantic_compressor-1.6.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
|
15
|
+
semantic_compressor-1.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|