PyPI - semantic-compressor - Versions diffs - 1.7__py3-none-any.whl → 1.9__py3-none-any.whl - Mend

semantic-compressor 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

compressor/semantic.py CHANGED Viewed

@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
-from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from concurrent.futures import ProcessPoolExecutor
 from nltk.tokenize import sent_tokenize
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
 from nltk.stem import PorterStemmer
 from nltk.stem import RSLPStemmer
 from collections import Counter
-import onnxruntime as ort
+from model2vec import StaticModel
 import nltk
 tokenizer = RegexTokenizer()
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
 portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
 langdetect_model = fasttext.load_model(fasttext_model_path)
-embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
-_options = ort.SessionOptions()
-_options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
-_options.register_custom_ops_library(get_library_path())
-_providers = ["CPUExecutionProvider"]
-embedding_model = ort.InferenceSession(
-    path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
-    sess_options=_options,
-    providers=_providers
-)
+embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
 hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
     return fixed_size_matrix.tolist()
 def extract_semantic_embeddings(text):
-    return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
+    return embedding_model.encode([text])[0]
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
     chunks = []
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
                 haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
         if embedding_mode in {'textual', 'both'}:
-            with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
+            with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
                 haystack_textual_embeddings = list(
                     executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
                 )

{semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: semantic_compressor
-Version: 1.7
+Version: 1.9
 Author: Carlo Moro
 Author-email: Carlo Moro <cnmoro@gmail.com>
 Classifier: Programming Language :: Python :: 3
@@ -9,12 +9,11 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy <2
+Requires-Dist: numpy<2
 Requires-Dist: nltk
 Requires-Dist: scikit-learn
 Requires-Dist: fasttext
-Requires-Dist: onnxruntime
-Requires-Dist: onnxruntime-extensions
+Requires-Dist: model2vec
 Requires-Dist: pyspellchecker
 ```python

{semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,14 @@
 compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-compressor/semantic.py,sha256=CJ5WhWKDuBT19qB_5EvMqWw5mtU2jCqBmOkVWXODLX0,16257
+compressor/semantic.py,sha256=QrCpAdOPnGuK8V-yjJ9mUZa0olZYj0Ul0wRJgNc5cI8,15738
 compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
 compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
 compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
 compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
-compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
 compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
 compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
-semantic_compressor-1.7.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
-semantic_compressor-1.7.dist-info/METADATA,sha256=I4nO2VQxeIOJAAzs2DMhxmotVV6IvdVMfeheUwAFCTQ,6178
-semantic_compressor-1.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-semantic_compressor-1.7.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
-semantic_compressor-1.7.dist-info/RECORD,,
+semantic_compressor-1.9.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
+semantic_compressor-1.9.dist-info/METADATA,sha256=1BP6qPt8T8sR9NrHG-pgS3GJKiIHpOXCWxTHadvu2KA,6137
+semantic_compressor-1.9.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+semantic_compressor-1.9.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
+semantic_compressor-1.9.dist-info/RECORD,,

{semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (75.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

compressor/resources/embedding_model.onnx DELETED Viewed

Binary file

{semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/LICENSE RENAMED Viewed

File without changes

{semantic_compressor-1.7.dist-info → semantic_compressor-1.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

semantic-compressor 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

semantic-compressor 1.7__py3-none-any.whl → 1.9__py3-none-any.whl