semantic-compressor 1.3__tar.gz → 1.4__tar.gz
- {semantic_compressor-1.3/semantic_compressor.egg-info → semantic_compressor-1.4}/PKG-INFO +2 -1
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/semantic.py +39 -4
- {semantic_compressor-1.3 → semantic_compressor-1.4}/pyproject.toml +4 -3
- {semantic_compressor-1.3 → semantic_compressor-1.4/semantic_compressor.egg-info}/PKG-INFO +2 -1
- {semantic_compressor-1.3 → semantic_compressor-1.4}/semantic_compressor.egg-info/requires.txt +1 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/setup.py +3 -2
- {semantic_compressor-1.3 → semantic_compressor-1.4}/LICENSE +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/README.md +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/__init__.py +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/minbpe/__init__.py +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/minbpe/base.py +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/minbpe/basic.py +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/minbpe/regex.py +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/resources/embedding_model.onnx +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/resources/en_stopwords.pkl +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/resources/lid.176.ftz +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/resources/pt_stopwords.pkl +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/semantic_compressor.egg-info/SOURCES.txt +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/semantic_compressor.egg-info/dependency_links.txt +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/semantic_compressor.egg-info/top_level.txt +0 -0
- {semantic_compressor-1.3 → semantic_compressor-1.4}/setup.cfg +0 -0

{semantic_compressor-1.3/semantic_compressor.egg-info → semantic_compressor-1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.4
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack

{semantic_compressor-1.3 → semantic_compressor-1.4}/compressor/semantic.py

@@ -6,14 +6,24 @@ from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from nltk.tokenize import sent_tokenize
 from multiprocessing import cpu_count
+from spellchecker import SpellChecker
+from nltk.stem import PorterStemmer
+from nltk.stem import RSLPStemmer
 from collections import Counter
 import onnxruntime as ort
+import nltk
+
+# Initializing the stemmers
+stemmer_english = PorterStemmer()
+stemmer_portuguese = RSLPStemmer()
 
 tokenizer = RegexTokenizer()
 nltk_data_path = str(importlib.resources.files('compressor').joinpath('resources/nltk_data'))
 
 os.environ['NLTK_DATA'] = nltk_data_path
 
+nltk.download('rslp')
+
 english_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/en_stopwords.pkl'))
 portuguese_stopwords_path = str(importlib.resources.files('compressor').joinpath('resources/pt_stopwords.pkl'))
 fasttext_model_path = str(importlib.resources.files('compressor').joinpath('resources/lid.176.ftz'))
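
The new imports wire NLTK's Porter (English) and RSLP (Portuguese) stemmers plus pyspellchecker into the module. A minimal standalone sketch of the same setup, outside the package and with illustrative sample words, looks like this:

```python
# Standalone sketch of the stemmer setup introduced in 1.4; sample words are
# illustrative only.
import nltk
from nltk.stem import PorterStemmer, RSLPStemmer

nltk.download('rslp')  # RSLPStemmer needs this NLTK resource at runtime

stemmer_english = PorterStemmer()
stemmer_portuguese = RSLPStemmer()

print(stemmer_english.stem("compression"))     # prints the English stem
print(stemmer_portuguese.stem("comprimindo"))  # prints the Portuguese stem
```
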
@@ -230,10 +240,28 @@ def compress_text(text, *, target_token_count=None, compression_rate=0.7, refere
 
     return text
 
+def stem_text(text, lang='en'):
+    if lang == 'en':
+        stems = [stemmer_english.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+    else:
+        stems = [stemmer_portuguese.stem(word) for word in text.split()]
+        stemmed_text = " ".join(stems)
+
+    return stemmed_text
+
+def correct_spelling(frase, detected_lang="pt"):
+    spell = SpellChecker(language=detected_lang)
+    words = frase.split()
+    fixed = [spell.correction(word) for word in words]
+    return " ".join(fixed)
+
 def find_needle_in_haystack(
     *, haystack: str, needle: str, block_size = 300,
     semantic_embeddings_weight: float = 0.3,
-    textual_embeddings_weight: float = 0.7
+    textual_embeddings_weight: float = 0.7,
+    use_stemming: bool = False,
+    correct_spelling_needle: bool = False
 ):
     """
     Finds the string block in the haystack that contains the needle.
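
The two helpers above are defined at module level, so once 1.4 is installed they should be importable next to the public API. A hedged usage sketch (sample strings are made up, not from the package README):

```python
# Hedged usage sketch of the new module-level helpers; sample text is made up.
from compressor.semantic import stem_text, correct_spelling

print(stem_text("compressing long documents", lang="en"))
print(correct_spelling("compresing long documnets", detected_lang="en"))
```

Note that depending on the installed pyspellchecker version, `correction()` can return `None` for out-of-vocabulary words, so callers may want to fall back to the original word in that case.
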
@@ -244,7 +272,9 @@ find_needle_in_haystack
         block_size (int, optional): The size of each string block. The needle will be searched in each block. Defaults to 350.
         semantic_embeddings_weight (float, optional): The weight of the semantic embeddings in the similarity calculation. Defaults to 0.3.
         textual_embeddings_weight (float, optional): The weight of the textual embeddings in the similarity calculation. Defaults to 0.7.
-
+        use_stemming (bool, optional): Whether to use stemming for the text. Defaults to False.
+        correct_spelling_needle (bool, optional): Whether to correct the spelling of the needle. Defaults to False.
+
     Returns:
         str: The string block in the haystack that contains the needle. The size of the needle will be less than or equal to the block size.
     """
@@ -252,14 +282,19 @@ def find_needle_in_haystack(
     try:
         # Split the haystack into blocks
         blocks = structurize_text(haystack, tokens_per_chunk=block_size)
+
+        lang = detect_language(f"{needle}\n\n{haystack}")
+
+        if correct_spelling_needle:
+            needle = correct_spelling(needle, lang)
 
         # Compute the embeddings of the needle
         needle_semantic_embedding = extract_semantic_embeddings(needle)
-        needle_textual_embedding = extract_textual_embeddings(needle.lower())
+        needle_textual_embedding = extract_textual_embeddings(needle.lower() if not use_stemming else stem_text(needle, lang))
 
         # Compute the embeddings of the haystack (each block)
         haystack_semantic_embeddings = [extract_semantic_embeddings(block) for block in blocks]
-        haystack_textual_embeddings = [extract_textual_embeddings(block.lower()) for block in blocks]
+        haystack_textual_embeddings = [extract_textual_embeddings(block.lower() if not use_stemming else stem_text(block.lower(), lang)) for block in blocks]
 
         # Compute the similarity between the needle and each block
         semantic_similarities = [calculate_similarity(needle_semantic_embedding, block_embedding) for block_embedding in haystack_semantic_embeddings]
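
Putting the new flags together, a call to the extended keyword-only API might look like the sketch below; the file name and query string are hypothetical placeholders.

```python
# Hedged example of the extended keyword-only signature shown above;
# "notes.txt" and the query text are hypothetical.
from compressor.semantic import find_needle_in_haystack

with open("notes.txt", encoding="utf-8") as f:
    haystack = f.read()

best_block = find_needle_in_haystack(
    haystack=haystack,
    needle="quarterly revenu figures",  # intentionally misspelled
    block_size=300,
    use_stemming=True,             # compare stemmed tokens instead of raw lowercase text
    correct_spelling_needle=True,  # run the needle through pyspellchecker first
)
print(best_block)
```
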

{semantic_compressor-1.3 → semantic_compressor-1.4}/pyproject.toml

@@ -1,10 +1,10 @@
 [build-system]
-requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions"]
+requires = ["setuptools>=61.0", "numpy<2", "nltk", "scikit-learn", "fasttext", "onnxruntime", "onnxruntime-extensions", "pyspellchecker"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "semantic_compressor"
-version = "1.3"
+version = "1.4"
 authors = [
     { name="Carlo Moro", email="cnmoro@gmail.com" },
 ]
@@ -22,5 +22,6 @@ dependencies = [
     "scikit-learn",
     "fasttext",
     "onnxruntime",
-    "onnxruntime-extensions"
+    "onnxruntime-extensions",
+    "pyspellchecker"
 ]

{semantic_compressor-1.3 → semantic_compressor-1.4/semantic_compressor.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.3
+Version: 1.4
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -15,6 +15,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: fasttext
 Requires-Dist: onnxruntime
 Requires-Dist: onnxruntime-extensions
+Requires-Dist: pyspellchecker
 
 ```python
 from compressor.semantic import compress_text, find_needle_in_haystack

{semantic_compressor-1.3 → semantic_compressor-1.4}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='semantic_compressor',
-    version='1.3',
+    version='1.4',
     author='Carlo Moro',
     author_email='cnmoro@gmail.com',
     description="Semantic text compression",
@@ -17,7 +17,8 @@ setup(
         "scikit-learn",
         "fasttext",
         "onnxruntime",
-        "onnxruntime-extensions"
+        "onnxruntime-extensions",
+        "pyspellchecker"
     ],
     classifiers=[
         'Programming Language :: Python :: 3',