semantic-compressor 1.3-py3-none-any.whl → 1.4-py3-none-any.whl
- compressor/semantic.py +39 -4
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/METADATA +2 -1
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/RECORD +6 -6
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/WHEEL +1 -1
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from spellchecker import SpellChecker
+from nltk.stem import PorterStemmer
+from nltk.stem import RSLPStemmer
 from collections import Counter
 import onnxruntime as ort
+import nltk
+
+# Initializing the stemmers
+stemmer_english = PorterStemmer()
+stemmer_portuguese = RSLPStemmer()
 
 tokenizer = RegexTokenizer()
 nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
 os.environ['NLTK_DATA'] = nltk_data_path
 
+nltk.download('rslp')
+
 english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
 portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
 fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
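The gist of this hunk: semantic.py now builds two module-level stemmers, NLTK's PorterStemmer for English and RSLPStemmer for Portuguese, and fetches the RSLP data at import time. A minimal standalone sketch of what that setup provides (assuming nltk is installed; the example words are our own):

```python
import nltk
from nltk.stem import PorterStemmer, RSLPStemmer

nltk.download('rslp')  # data files required by the Portuguese RSLP stemmer

stemmer_english = PorterStemmer()
stemmer_portuguese = RSLPStemmer()

print(stemmer_english.stem("running"))      # -> "run"
print(stemmer_portuguese.stem("correndo"))  # Portuguese gerund reduced to its stem
```

Because nltk.download('rslp') runs at module import, the first import of compressor.semantic may touch the network unless the data is already cached under NLTK_DATA.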
@@ -230,10 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
     return text
 
+def stem_text(text, lang='en'):
+    if lang == 'en':
+        stems = [stemmer_english.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+    else:
+        stems = [stemmer_portuguese.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+
+    return stemmed_text
+
+def correct_spelling(frase, detected_lang="pt"):
+    spell = SpellChecker(language=detected_lang)
+    words = frase.split()
+    fixed = [spell.correction(word) for word in words]
+    return " ".join(fixed)
+
 def find_needle_in_haystack(
     *, haystack: str, needle: str, block_size = 300,
     semantic_embeddings_weight: float = 0.3,
-    textual_embeddings_weight: float = 0.7
+    textual_embeddings_weight: float = 0.7,
+    use_stemming: bool = False,
+    correct_spelling_needle: bool = False
 ):
     """
     Finds the string block in the haystack that contains the needle.
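Both new helpers are word-by-word transforms over whitespace-split tokens: stem_text picks the stemmer by language code, and correct_spelling runs each word through pyspellchecker. One caveat worth flagging: recent versions of pyspellchecker return None from correction() when no candidate is found, which would break the " ".join(fixed) above on unknown tokens. A defensive variant (a sketch, not the package's code) could fall back to the original word:

```python
from spellchecker import SpellChecker

def correct_spelling_safe(phrase, detected_lang="pt"):
    # Keep a word unchanged when pyspellchecker has no suggestion
    # for it (correction() may return None in recent versions).
    spell = SpellChecker(language=detected_lang)
    fixed = [spell.correction(word) or word for word in phrase.split()]
    return " ".join(fixed)
```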
@@ -244,7 +272,9 @@ def find_needle_in_haystack(
         block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
         semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
         textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+        use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+        correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
     Returns:
         str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
     """
@@ -252,14 +282,19 @@ def find_needle_in_haystack(
     try:
         # Split the haystack into blocks
        blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        lang = detect_language(f"{needle}\n\n{haystack}")
+
+        if correct_spelling_needle:
+            needle = correct_spelling(needle, lang)
 
         # Compute the embeddings of the needle
         needle_semantic_embedding = extract_semantic_embeddings(needle)
-        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+        needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
         # Compute the embeddings of the haystack (each block)
         haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
         # Compute the similarity between the needle and each block
         semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
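Taken together: the language is detected once from the needle and haystack combined, then reused both for the optional spell correction of the needle and for stemming the inputs to the textual embeddings (stemming applies only to the textual embeddings, not the semantic ones). A hypothetical call exercising the new flags (the file name and query are made up):

```python
from compressor.semantic import find_needle_in_haystack

with open("report.txt") as f:  # hypothetical input document
    document = f.read()

best_block = find_needle_in_haystack(
    haystack=document,
    needle="qarterly revenue figures",  # deliberate typo for the spell checker
    use_stemming=True,                  # compare stems, not raw lowercased tokens
    correct_spelling_needle=True,       # fix the needle via correct_spelling() first
)
print(best_block)
```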
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.4
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=
+compressor/semantic.py,sha256=OxqzVCAnICKD3W_P3SAe4JbJt-PyOs5VVR-go8taZVI,13701
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
@@ -8,8 +8,8 @@ compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
-semantic_compressor-1.
+semantic_compressor-1.4.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.4.dist-info/METADATA,sha256=BEKlYCs7nYakGXQzbC_8_Gz-MKSAXzSp01pAD0HjIS0,6178
+semantic_compressor-1.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+semantic_compressor-1.4.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.4.dist-info/RECORD,,
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/LICENSE
File without changes
{semantic_compressor-1.3.dist-info → semantic_compressor-1.4.dist-info}/top_level.txt
File without changes