semantic-compressor 1.4__py3-none-any.whl → 1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressor/semantic.py +14 -7
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/METADATA +1 -1
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/RECORD +6 -6
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/WHEEL +0 -0
- {semantic_compressor-1.4.dist-info → semantic_compressor-1.6.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -13,10 +13,6 @@ from collections import Counter
|
|
13
13
|
import onnxruntime as ort
|
14
14
|
import nltk
|
15
15
|
|
16
|
-
# Inicializando os stemmers
|
17
|
-
stemmer_english = PorterStemmer()
|
18
|
-
stemmer_portuguese = RSLPStemmer()
|
19
|
-
|
20
16
|
tokenizer = RegexTokenizer()
|
21
17
|
nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
|
22
18
|
|
@@ -24,6 +20,10 @@ os.environ['NLTK_DATA'] = nltk_data_path
|
|
24
20
|
|
25
21
|
nltk.download('rslp')
|
26
22
|
|
23
|
+
# Inicializando os stemmers
|
24
|
+
stemmer_english = PorterStemmer()
|
25
|
+
stemmer_portuguese = RSLPStemmer()
|
26
|
+
|
27
27
|
english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
|
28
28
|
portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
|
29
29
|
fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
|
@@ -250,11 +250,18 @@ def stem_text(text, lang='en'):
|
|
250
250
|
|
251
251
|
return stemmed_text
|
252
252
|
|
253
|
-
def correct_spelling(
|
253
|
+
def correct_spelling(sentence, detected_lang="pt"):
|
254
254
|
spell = SpellChecker(language=detected_lang)
|
255
|
-
words =
|
255
|
+
words = sentence.split()
|
256
256
|
fixed = [spell.correction(word) for word in words]
|
257
|
-
|
257
|
+
|
258
|
+
final_words = []
|
259
|
+
|
260
|
+
# Interpolate original words with fixed words (each word could be "None" in "fixed" when no correction is needed)
|
261
|
+
for original, fixed_word in zip(words, fixed):
|
262
|
+
final_words.append(fixed_word if fixed_word is not None else original)
|
263
|
+
|
264
|
+
return " ".join(final_words)
|
258
265
|
|
259
266
|
def find_needle_in_haystack(
|
260
267
|
*, haystack: str, needle: str, block_size = 300,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
compressor/semantic.py,sha256=
|
2
|
+
compressor/semantic.py,sha256=zIqmajE_X4tyLvvj5I5umdPqm_RB5D4sT5hRQG3qLZA,13985
|
3
3
|
compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
|
4
4
|
compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
|
5
5
|
compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
|
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
|
|
8
8
|
compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
|
9
9
|
compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
10
10
|
compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
|
11
|
-
semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
|
12
|
-
semantic_compressor-1.
|
13
|
-
semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
14
|
-
semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
|
15
|
-
semantic_compressor-1.4.dist-info/RECORD,,
|
11
|
+
semantic_compressor-1.6.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
|
12
|
+
semantic_compressor-1.6.dist-info/METADATA,sha256=MAuc9bjclfAdJ-sLWNq1FYEiH0qtFvdGlvj_ja8gCW8,6178
|
13
|
+
semantic_compressor-1.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
14
|
+
semantic_compressor-1.6.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
|
15
|
+
semantic_compressor-1.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|