semantic-compressor 1.7__tar.gz → 1.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {semantic_compressor-1.7/semantic_compressor.egg-info → semantic_compressor-1.9}/PKG-INFO +2 -3
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/semantic.py +4 -16
- {semantic_compressor-1.7 → semantic_compressor-1.9}/pyproject.toml +2 -3
- {semantic_compressor-1.7 → semantic_compressor-1.9/semantic_compressor.egg-info}/PKG-INFO +2 -3
- {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/SOURCES.txt +0 -1
- {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/requires.txt +1 -2
- {semantic_compressor-1.7 → semantic_compressor-1.9}/setup.py +2 -3
- semantic_compressor-1.7/compressor/resources/embedding_model.onnx +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/LICENSE +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/README.md +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/__init__.py +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/__init__.py +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/base.py +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/basic.py +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/regex.py +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/en_stopwords.pkl +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/lid.176.ftz +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/pt_stopwords.pkl +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/dependency_links.txt +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/top_level.txt +0 -0
- {semantic_compressor-1.7 → semantic_compressor-1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: semantic_compressor
|
3
|
-
Version: 1.7
|
3
|
+
Version: 1.9
|
4
4
|
Author: Carlo Moro
|
5
5
|
Author-email: Carlo Moro <cnmoro@gmail.com>
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
|
|
13
13
|
Requires-Dist: nltk
|
14
14
|
Requires-Dist: scikit-learn
|
15
15
|
Requires-Dist: fasttext
|
16
|
-
Requires-Dist: onnxruntime
|
17
|
-
Requires-Dist: onnxruntime-extensions
|
16
|
+
Requires-Dist: model2vec
|
18
17
|
Requires-Dist: pyspellchecker
|
19
18
|
|
20
19
|
```python
|
@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
|
|
2
2
|
import numpy as np, pickle, fasttext, os, traceback, importlib
|
3
3
|
from sklearn.decomposition import LatentDirichletAllocation
|
4
4
|
from sklearn.metrics.pairwise import cosine_similarity
|
5
|
-
from onnxruntime_extensions import get_library_path
|
6
5
|
from compressor.minbpe.regex import RegexTokenizer
|
7
6
|
from concurrent.futures import ProcessPoolExecutor
|
8
7
|
from nltk.tokenize import sent_tokenize
|
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
|
|
11
10
|
from nltk.stem import PorterStemmer
|
12
11
|
from nltk.stem import RSLPStemmer
|
13
12
|
from collections import Counter
|
14
|
-
|
13
|
+
from model2vec import StaticModel
|
15
14
|
import nltk
|
16
15
|
|
17
16
|
tokenizer = RegexTokenizer()
|
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
|
|
32
31
|
portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
|
33
32
|
langdetect_model = fasttext.load_model(fasttext_model_path)
|
34
33
|
|
35
|
-
|
36
|
-
|
37
|
-
_options = ort.SessionOptions()
|
38
|
-
_options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
|
39
|
-
_options.register_custom_ops_library(get_library_path())
|
40
|
-
_providers = ["CPUExecutionProvider"]
|
41
|
-
|
42
|
-
embedding_model = ort.InferenceSession(
|
43
|
-
path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
|
44
|
-
sess_options=_options,
|
45
|
-
providers=_providers
|
46
|
-
)
|
34
|
+
embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
|
47
35
|
|
48
36
|
hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
|
49
37
|
|
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
|
|
54
42
|
return fixed_size_matrix.tolist()
|
55
43
|
|
56
44
|
def extract_semantic_embeddings(text):
|
57
|
-
return embedding_model.
|
45
|
+
return embedding_model.encode([text])[0]
|
58
46
|
|
59
47
|
def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
|
60
48
|
chunks = []
|
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
|
|
339
327
|
haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
|
340
328
|
|
341
329
|
if embedding_mode in {'textual', 'both'}:
|
342
|
-
with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
|
330
|
+
with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
|
343
331
|
haystack_textual_embeddings = list(
|
344
332
|
executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
|
345
333
|
)
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "semantic_compressor"
|
7
|
-
version = "1.7"
|
7
|
+
version = "1.9"
|
8
8
|
authors = [
|
9
9
|
{ name="Carlo Moro", email="cnmoro@gmail.com" },
|
10
10
|
]
|
@@ -21,7 +21,6 @@ dependencies = [
|
|
21
21
|
"nltk",
|
22
22
|
"scikit-learn",
|
23
23
|
"fasttext",
|
24
|
-
"onnxruntime",
|
25
|
-
"onnxruntime-extensions",
|
24
|
+
"model2vec",
|
26
25
|
"pyspellchecker"
|
27
26
|
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: semantic_compressor
|
3
|
-
Version: 1.7
|
3
|
+
Version: 1.9
|
4
4
|
Author: Carlo Moro
|
5
5
|
Author-email: Carlo Moro <cnmoro@gmail.com>
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
|
|
13
13
|
Requires-Dist: nltk
|
14
14
|
Requires-Dist: scikit-learn
|
15
15
|
Requires-Dist: fasttext
|
16
|
-
Requires-Dist: onnxruntime
|
17
|
-
Requires-Dist: onnxruntime-extensions
|
16
|
+
Requires-Dist: model2vec
|
18
17
|
Requires-Dist: pyspellchecker
|
19
18
|
|
20
19
|
```python
|
{semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/SOURCES.txt
RENAMED
@@ -8,7 +8,6 @@ compressor/minbpe/__init__.py
|
|
8
8
|
compressor/minbpe/base.py
|
9
9
|
compressor/minbpe/basic.py
|
10
10
|
compressor/minbpe/regex.py
|
11
|
-
compressor/resources/embedding_model.onnx
|
12
11
|
compressor/resources/en_stopwords.pkl
|
13
12
|
compressor/resources/lid.176.ftz
|
14
13
|
compressor/resources/pt_stopwords.pkl
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
2
2
|
|
3
3
|
setup(
|
4
4
|
name='semantic_compressor',
|
5
|
-
version='1.7',
|
5
|
+
version='1.9',
|
6
6
|
author='Carlo Moro',
|
7
7
|
author_email='cnmoro@gmail.com',
|
8
8
|
description="Semantic text compression",
|
@@ -16,8 +16,7 @@ setup(
|
|
16
16
|
"nltk",
|
17
17
|
"scikit-learn",
|
18
18
|
"fasttext",
|
19
|
-
"onnxruntime",
|
20
|
-
"onnxruntime-extensions",
|
19
|
+
"model2vec",
|
21
20
|
"pyspellchecker"
|
22
21
|
],
|
23
22
|
classifiers=[
|
Binary file
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/top_level.txt
RENAMED
File without changes
|
File without changes
|