semantic-compressor 1.3-py3-none-any.whl → 1.4-py3-none-any.whl
- compressor/semantic.py +39 -4
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/METADATA +2 -1
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/RECORD +6 -6
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/WHEEL +1 -1
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from spellchecker import SpellChecker
+from nltk.stem import PorterStemmer
+from nltk.stem import RSLPStemmer
 from collections import Counter
 import onnxruntime as ort
+import nltk
+
+# Initializing the stemmers
+stemmer_english = PorterStemmer()
+stemmer_portuguese = RSLPStemmer()
 
 tokenizer = RegexTokenizer()
 nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
 os.environ['NLTK_DATA'] = nltk_data_path
 
+nltk.download('rslp')
+
 english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
 portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
 fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
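For context, a small sketch of what the new module-level stemmers do, assuming NLTK is installed and the 'rslp' data package has been downloaded (the sample words are illustrative, not taken from this diff):

```python
from nltk.stem import PorterStemmer, RSLPStemmer
import nltk

nltk.download('rslp')  # the RSLP (Portuguese) stemmer needs this data package

stemmer_english = PorterStemmer()
stemmer_portuguese = RSLPStemmer()

# Porter strips English inflections, e.g. "running" -> "run"
print(stemmer_english.stem("running"))
# RSLP applies the analogous suffix-stripping rules to Portuguese words
print(stemmer_portuguese.stem("correndo"))
```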
@@ -230,10 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
     return text
 
+def stem_text(text, lang='en'):
+    if lang == 'en':
+        stems = [stemmer_english.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+    else:
+        stems = [stemmer_portuguese.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+
+    return stemmed_text
+
+def correct_spelling(frase, detected_lang="pt"):
+    spell = SpellChecker(language=detected_lang)
+    words = frase.split()
+    fixed = [spell.correction(word) for word in words]
+    return " ".join(fixed)
+
 def find_needle_in_haystack(
     *, haystack: str, needle: str, block_size = 300,
     semantic_embeddings_weight: float = 0.3,
-    textual_embeddings_weight: float = 0.7
+    textual_embeddings_weight: float = 0.7,
+    use_stemming: bool = False,
+    correct_spelling_needle: bool = False
 ):
     """
     Finds the string block in the haystack that contains the needle.
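A minimal usage sketch of the two helpers added above; the example strings are made up, and note that pyspellchecker's correction() can return None for words it does not recognize:

```python
from compressor.semantic import stem_text, correct_spelling

# Stems with PorterStemmer when lang == 'en', RSLPStemmer otherwise
print(stem_text("compressing semantic blocks", lang='en'))
print(stem_text("comprimindo blocos de texto", lang='pt'))

# Runs each word through pyspellchecker for the given language
print(correct_spelling("semantc compresion", detected_lang="en"))
```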
@@ -244,7 +272,9 @@ def find_needle_in_haystack(
         block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
         semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
         textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+        use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+        correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
     Returns:
         str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
     """
@@ -252,14 +282,19 @@ def find_needle_in_haystack(
     try:
         # Split the haystack into blocks
         blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        lang = detect_language(f"{needle}\n\n{haystack}")
+
+        if correct_spelling_needle:
+            needle = correct_spelling(needle, lang)
 
         # Compute the embeddings of the needle
         needle_semantic_embedding = extract_semantic_embeddings(needle)
-        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+        needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
         # Compute the embeddings of the haystack (each block)
         haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
         # Compute the similarity between the needle and each block
         semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
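Putting the new keyword arguments together, a hedged example of calling find_needle_in_haystack with stemming and needle spell-correction enabled (the file name and query text are hypothetical):

```python
from compressor.semantic import find_needle_in_haystack

haystack = open("report.txt", encoding="utf-8").read()  # hypothetical long document
needle = "quarterly revnue projections"                 # note the intentional typo

best_block = find_needle_in_haystack(
    haystack=haystack,
    needle=needle,
    block_size=300,
    use_stemming=True,             # compare stemmed text instead of plain lowercased text
    correct_spelling_needle=True,  # run the needle through correct_spelling() first
)
print(best_block)
```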
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.4
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
+semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.4.dist-info/RECORD,,
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/LICENSE
File without changes

{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/top_level.txt
File without changes