semantic-compressor 1.7.tar.gz → 1.9.tar.gz

Files changed (21)
  1. {semantic_compressor-1.7/semantic_compressor.egg-info → semantic_compressor-1.9}/PKG-INFO +2 -3
  2. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/semantic.py +4 -16
  3. {semantic_compressor-1.7 → semantic_compressor-1.9}/pyproject.toml +2 -3
  4. {semantic_compressor-1.7 → semantic_compressor-1.9/semantic_compressor.egg-info}/PKG-INFO +2 -3
  5. {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/SOURCES.txt +0 -1
  6. {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/requires.txt +1 -2
  7. {semantic_compressor-1.7 → semantic_compressor-1.9}/setup.py +2 -3
  8. semantic_compressor-1.7/compressor/resources/embedding_model.onnx +0 -0
  9. {semantic_compressor-1.7 → semantic_compressor-1.9}/LICENSE +0 -0
  10. {semantic_compressor-1.7 → semantic_compressor-1.9}/README.md +0 -0
  11. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/__init__.py +0 -0
  12. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/__init__.py +0 -0
  13. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/base.py +0 -0
  14. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/basic.py +0 -0
  15. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/minbpe/regex.py +0 -0
  16. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/en_stopwords.pkl +0 -0
  17. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/lid.176.ftz +0 -0
  18. {semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/resources/pt_stopwords.pkl +0 -0
  19. {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  20. {semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.7 → semantic_compressor-1.9}/setup.cfg +0 -0
{semantic_compressor-1.7/semantic_compressor.egg-info → semantic_compressor-1.9}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.7
+ Version: 1.9
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
  Requires-Dist: fasttext
- Requires-Dist: onnxruntime
- Requires-Dist: onnxruntime-extensions
+ Requires-Dist: model2vec
  Requires-Dist: pyspellchecker

  ```python
{semantic_compressor-1.7 → semantic_compressor-1.9}/compressor/semantic.py

@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
  import numpy as np, pickle, fasttext, os, traceback, importlib
  from sklearn.decomposition import LatentDirichletAllocation
  from sklearn.metrics.pairwise import cosine_similarity
- from onnxruntime_extensions import get_library_path
  from compressor.minbpe.regex import RegexTokenizer
  from concurrent.futures import ProcessPoolExecutor
  from nltk.tokenize import sent_tokenize
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
  from nltk.stem import PorterStemmer
  from nltk.stem import RSLPStemmer
  from collections import Counter
- import onnxruntime as ort
+ from model2vec import StaticModel
  import nltk

  tokenizer = RegexTokenizer()
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
  langdetect_model = fasttext.load_model(fasttext_model_path)

- embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
-
- _options = ort.SessionOptions()
- _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
- _options.register_custom_ops_library(get_library_path())
- _providers = ["CPUExecutionProvider"]
-
- embedding_model = ort.InferenceSession(
-     path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
-     sess_options=_options,
-     providers=_providers
- )
+ embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")

  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)

@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
      return fixed_size_matrix.tolist()

  def extract_semantic_embeddings(text):
-     return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
+     return embedding_model.encode([text])[0]

  def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
      chunks = []
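The hunks above are the substance of the 1.7 → 1.9 bump: the bundled ONNX model and its onnxruntime session setup (thread options, onnxruntime-extensions custom ops, the EMBEDDING_MODEL_CPU_COUNT variable) give way to a model2vec static embedding model fetched from the Hugging Face Hub. A minimal sketch of the new embedding path, assuming the current model2vec API (StaticModel.from_pretrained / encode) and network access to the model repo named in the diff:

```python
from model2vec import StaticModel

# Model repo name taken from the diff; weights are downloaded on first use
# and cached locally by model2vec.
embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")

def extract_semantic_embeddings(text):
    # encode() takes a list of strings and returns one vector per input;
    # [0] unwraps the single-text case, exactly as in the new code.
    return embedding_model.encode([text])[0]

vec = extract_semantic_embeddings("semantic compression keeps the gist, drops the rest")
print(vec.shape)  # dimensionality depends on the distilled model
```

This swap is also why compressor/resources/embedding_model.onnx drops out of the sdist (file 8 above, and the SOURCES.txt hunk below): the weights no longer ship inside the package.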
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
          haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))

      if embedding_mode in {'textual', 'both'}:
-         with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
+         with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
              haystack_textual_embeddings = list(
                  executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
              )
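This last hunk is a bug fix rather than part of the migration: cpu_count()//1.5 is floor division by a float, which yields a float in Python, and ProcessPoolExecutor needs an integer max_workers. A self-contained illustration (the exact failure point comes from CPython's pool internals, so treat the traceback as indicative):

```python
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count

if __name__ == "__main__":
    workers = cpu_count() // 1.5   # e.g. 8 // 1.5 == 5.0 -- a float, not an int
    print(type(workers))           # <class 'float'>

    # A float slips past the constructor's max_workers check but fails once
    # the pool spawns workers, since the count ends up inside range():
    #   TypeError: 'float' object cannot be interpreted as an integer
    with ProcessPoolExecutor(max_workers=int(workers)) as executor:
        print(list(executor.map(abs, [-3, -2, -1])))  # [3, 2, 1]
```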
{semantic_compressor-1.7 → semantic_compressor-1.9}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "semantic_compressor"
- version = "1.7"
+ version = "1.9"
  authors = [
      { name="Carlo Moro", email="cnmoro@gmail.com" },
  ]
@@ -21,7 +21,6 @@ dependencies = [
      "nltk",
      "scikit-learn",
      "fasttext",
-     "onnxruntime",
-     "onnxruntime-extensions",
+     "model2vec",
      "pyspellchecker"
  ]
{semantic_compressor-1.7 → semantic_compressor-1.9/semantic_compressor.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: semantic_compressor
- Version: 1.7
+ Version: 1.9
  Author: Carlo Moro
  Author-email: Carlo Moro <cnmoro@gmail.com>
  Classifier: Programming Language :: Python :: 3
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
  Requires-Dist: nltk
  Requires-Dist: scikit-learn
  Requires-Dist: fasttext
- Requires-Dist: onnxruntime
- Requires-Dist: onnxruntime-extensions
+ Requires-Dist: model2vec
  Requires-Dist: pyspellchecker

  ```python
{semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/SOURCES.txt

@@ -8,7 +8,6 @@ compressor/minbpe/__init__.py
  compressor/minbpe/base.py
  compressor/minbpe/basic.py
  compressor/minbpe/regex.py
- compressor/resources/embedding_model.onnx
  compressor/resources/en_stopwords.pkl
  compressor/resources/lid.176.ftz
  compressor/resources/pt_stopwords.pkl
{semantic_compressor-1.7 → semantic_compressor-1.9}/semantic_compressor.egg-info/requires.txt

@@ -2,6 +2,5 @@ numpy<2
  nltk
  scikit-learn
  fasttext
- onnxruntime
- onnxruntime-extensions
+ model2vec
  pyspellchecker
{semantic_compressor-1.7 → semantic_compressor-1.9}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
      name='semantic_compressor',
-     version='1.7',
+     version='1.9',
      author='Carlo Moro',
      author_email='cnmoro@gmail.com',
      description="Semantic text compression",
@@ -16,8 +16,7 @@ setup(
          "nltk",
          "scikit-learn",
          "fasttext",
-         "onnxruntime",
-         "onnxruntime-extensions",
+         "model2vec",
          "pyspellchecker"
      ],
      classifiers=[