semantic-compressor 1.7__py3-none-any.whl → 1.9__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- compressor/semantic.py +4 -16
- {semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/METADATA +3 -4
- {semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/RECORD +6 -7
- {semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/WHEEL +1 -1
- compressor/resources/embedding_model.onnx +0 -0
- {semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/LICENSE +0 -0
- {semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/top_level.txt +0 -0
compressor/semantic.py
CHANGED
@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
|
|
2
2
|
import numpy as np, pickle, fasttext, os, traceback, importlib
|
3
3
|
from sklearn.decomposition import LatentDirichletAllocation
|
4
4
|
from sklearn.metrics.pairwise import cosine_similarity
|
5
|
-
from onnxruntime_extensions import get_library_path
|
6
5
|
from compressor.minbpe.regex import RegexTokenizer
|
7
6
|
from concurrent.futures import ProcessPoolExecutor
|
8
7
|
from nltk.tokenize import sent_tokenize
|
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
|
|
11
10
|
from nltk.stem import PorterStemmer
|
12
11
|
from nltk.stem import RSLPStemmer
|
13
12
|
from collections import Counter
|
14
|
-
|
13
|
+
from model2vec import StaticModel
|
15
14
|
import nltk
|
16
15
|
|
17
16
|
tokenizer = RegexTokenizer()
|
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
|
|
32
31
|
portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
|
33
32
|
langdetect_model = fasttext.load_model(fasttext_model_path)
|
34
33
|
|
35
|
-
|
36
|
-
|
37
|
-
_options = ort.SessionOptions()
|
38
|
-
_options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
|
39
|
-
_options.register_custom_ops_library(get_library_path())
|
40
|
-
_providers = ["CPUExecutionProvider"]
|
41
|
-
|
42
|
-
embedding_model = ort.InferenceSession(
|
43
|
-
path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
|
44
|
-
sess_options=_options,
|
45
|
-
providers=_providers
|
46
|
-
)
|
34
|
+
embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
|
47
35
|
|
48
36
|
hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
|
49
37
|
|
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
|
|
54
42
|
return fixed_size_matrix.tolist()
|
55
43
|
|
56
44
|
def extract_semantic_embeddings(text):
|
57
|
-
return embedding_model.
|
45
|
+
return embedding_model.encode([text])[0]
|
58
46
|
|
59
47
|
def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
|
60
48
|
chunks = []
|
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
|
|
339
327
|
haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
|
340
328
|
|
341
329
|
if embedding_mode in {'textual', 'both'}:
|
342
|
-
with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
|
330
|
+
with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
|
343
331
|
haystack_textual_embeddings = list(
|
344
332
|
executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
|
345
333
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: semantic_compressor
|
3
|
-
Version: 1.7
|
3
|
+
Version: 1.9
|
4
4
|
Author: Carlo Moro
|
5
5
|
Author-email: Carlo Moro <cnmoro@gmail.com>
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -9,12 +9,11 @@ Classifier: Operating System :: OS Independent
|
|
9
9
|
Requires-Python: >=3.7
|
10
10
|
Description-Content-Type: text/markdown
|
11
11
|
License-File: LICENSE
|
12
|
-
Requires-Dist: numpy
|
12
|
+
Requires-Dist: numpy<2
|
13
13
|
Requires-Dist: nltk
|
14
14
|
Requires-Dist: scikit-learn
|
15
15
|
Requires-Dist: fasttext
|
16
|
-
Requires-Dist: onnxruntime
|
17
|
-
Requires-Dist: onnxruntime-extensions
|
16
|
+
Requires-Dist: model2vec
|
18
17
|
Requires-Dist: pyspellchecker
|
19
18
|
|
20
19
|
```python
|
@@ -1,15 +1,14 @@
|
|
1
1
|
compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
compressor/semantic.py,sha256=
|
2
|
+
compressor/semantic.py,sha256=QrCpAdOPnGuK8V-yjJ9mUZa0olZYj0Ul0wRJgNc5cI8,15738
|
3
3
|
compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
|
4
4
|
compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
|
5
5
|
compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
|
6
6
|
compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
|
7
|
-
compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
|
8
7
|
compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
|
9
8
|
compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
10
9
|
compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
|
11
|
-
semantic_compressor-1.7.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
|
12
|
-
semantic_compressor-1.
|
13
|
-
semantic_compressor-1.
|
14
|
-
semantic_compressor-1.7.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
|
15
|
-
semantic_compressor-1.7.dist-info/RECORD,,
|
10
|
+
semantic_compressor-1.9.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
|
11
|
+
semantic_compressor-1.9.dist-info/METADATA,sha256=1BP6qPt8T8sR9NrHG-pgS3GJKiIHpOXCWxTHadvu2KA,6137
|
12
|
+
semantic_compressor-1.9.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
13
|
+
semantic_compressor-1.9.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
|
14
|
+
semantic_compressor-1.9.dist-info/RECORD,,
|
Binary file
|
File without changes
|
File without changes
|