semantic-compressor 1.8__tar.gz → 1.9__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of semantic-compressor might be problematic. Click here for more details.

Files changed (21)
  1. {semantic_compressor-1.8/semantic_compressor.egg-info → semantic_compressor-1.9}/PKG-INFO +2 -3
  2. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/semantic.py +3 -15
  3. {semantic_compressor-1.8 → semantic_compressor-1.9}/pyproject.toml +2 -3
  4. {semantic_compressor-1.8 → semantic_compressor-1.9/semantic_compressor.egg-info}/PKG-INFO +2 -3
  5. {semantic_compressor-1.8 → semantic_compressor-1.9}/semantic_compressor.egg-info/SOURCES.txt +0 -1
  6. {semantic_compressor-1.8 → semantic_compressor-1.9}/semantic_compressor.egg-info/requires.txt +1 -2
  7. {semantic_compressor-1.8 → semantic_compressor-1.9}/setup.py +2 -3
  8. semantic_compressor-1.8/compressor/resources/embedding_model.onnx +0 -0
  9. {semantic_compressor-1.8 → semantic_compressor-1.9}/LICENSE +0 -0
  10. {semantic_compressor-1.8 → semantic_compressor-1.9}/README.md +0 -0
  11. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/__init__.py +0 -0
  12. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/minbpe/__init__.py +0 -0
  13. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/minbpe/base.py +0 -0
  14. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/minbpe/basic.py +0 -0
  15. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/minbpe/regex.py +0 -0
  16. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/resources/en_stopwords.pkl +0 -0
  17. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/resources/lid.176.ftz +0 -0
  18. {semantic_compressor-1.8 → semantic_compressor-1.9}/compressor/resources/pt_stopwords.pkl +0 -0
  19. {semantic_compressor-1.8 → semantic_compressor-1.9}/semantic_compressor.egg-info/dependency_links.txt +0 -0
  20. {semantic_compressor-1.8 → semantic_compressor-1.9}/semantic_compressor.egg-info/top_level.txt +0 -0
  21. {semantic_compressor-1.8 → semantic_compressor-1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.8
3
+ Version: 1.9
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
13
13
  Requires-Dist: nltk
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: fasttext
16
- Requires-Dist: onnxruntime
17
- Requires-Dist: onnxruntime-extensions
16
+ Requires-Dist: model2vec
18
17
  Requires-Dist: pyspellchecker
19
18
 
20
19
  ```python
@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
2
2
  import numpy as np, pickle, fasttext, os, traceback, importlib
3
3
  from sklearn.decomposition import LatentDirichletAllocation
4
4
  from sklearn.metrics.pairwise import cosine_similarity
5
- from onnxruntime_extensions import get_library_path
6
5
  from compressor.minbpe.regex import RegexTokenizer
7
6
  from concurrent.futures import ProcessPoolExecutor
8
7
  from nltk.tokenize import sent_tokenize
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
11
10
  from nltk.stem import PorterStemmer
12
11
  from nltk.stem import RSLPStemmer
13
12
  from collections import Counter
14
- import onnxruntime as ort
13
+ from model2vec import StaticModel
15
14
  import nltk
16
15
 
17
16
  tokenizer = RegexTokenizer()
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
32
31
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
33
32
  langdetect_model = fasttext.load_model(fasttext_model_path)
34
33
 
35
- embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
36
-
37
- _options = ort.SessionOptions()
38
- _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
39
- _options.register_custom_ops_library(get_library_path())
40
- _providers = ["CPUExecutionProvider"]
41
-
42
- embedding_model = ort.InferenceSession(
43
- path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
44
- sess_options=_options,
45
- providers=_providers
46
- )
34
+ embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
47
35
 
48
36
  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
49
37
 
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
54
42
  return fixed_size_matrix.tolist()
55
43
 
56
44
  def extract_semantic_embeddings(text):
57
- return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
45
+ return embedding_model.encode([text])[0]
58
46
 
59
47
  def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
60
48
  chunks = []
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "semantic_compressor"
7
- version = "1.8"
7
+ version = "1.9"
8
8
  authors = [
9
9
  { name="Carlo Moro", email="cnmoro@gmail.com" },
10
10
  ]
@@ -21,7 +21,6 @@ dependencies = [
21
21
  "nltk",
22
22
  "scikit-learn",
23
23
  "fasttext",
24
- "onnxruntime",
25
- "onnxruntime-extensions",
24
+ "model2vec",
26
25
  "pyspellchecker"
27
26
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.8
3
+ Version: 1.9
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -13,8 +13,7 @@ Requires-Dist: numpy<2
13
13
  Requires-Dist: nltk
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: fasttext
16
- Requires-Dist: onnxruntime
17
- Requires-Dist: onnxruntime-extensions
16
+ Requires-Dist: model2vec
18
17
  Requires-Dist: pyspellchecker
19
18
 
20
19
  ```python
@@ -8,7 +8,6 @@ compressor/minbpe/__init__.py
8
8
  compressor/minbpe/base.py
9
9
  compressor/minbpe/basic.py
10
10
  compressor/minbpe/regex.py
11
- compressor/resources/embedding_model.onnx
12
11
  compressor/resources/en_stopwords.pkl
13
12
  compressor/resources/lid.176.ftz
14
13
  compressor/resources/pt_stopwords.pkl
@@ -2,6 +2,5 @@ numpy<2
2
2
  nltk
3
3
  scikit-learn
4
4
  fasttext
5
- onnxruntime
6
- onnxruntime-extensions
5
+ model2vec
7
6
  pyspellchecker
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='semantic_compressor',
5
- version='1.8',
5
+ version='1.9',
6
6
  author='Carlo Moro',
7
7
  author_email='cnmoro@gmail.com',
8
8
  description="Semantic text compression",
@@ -16,8 +16,7 @@ setup(
16
16
  "nltk",
17
17
  "scikit-learn",
18
18
  "fasttext",
19
- "onnxruntime",
20
- "onnxruntime-extensions",
19
+ "model2vec",
21
20
  "pyspellchecker"
22
21
  ],
23
22
  classifiers=[