semantic-compressor 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
compressor/semantic.py CHANGED
@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
2
2
  import numpy as np, pickle, fasttext, os, traceback, importlib
3
3
  from sklearn.decomposition import LatentDirichletAllocation
4
4
  from sklearn.metrics.pairwise import cosine_similarity
5
- from onnxruntime_extensions import get_library_path
6
5
  from compressor.minbpe.regex import RegexTokenizer
7
6
  from concurrent.futures import ProcessPoolExecutor
8
7
  from nltk.tokenize import sent_tokenize
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
11
10
  from nltk.stem import PorterStemmer
12
11
  from nltk.stem import RSLPStemmer
13
12
  from collections import Counter
14
- import onnxruntime as ort
13
+ from model2vec import StaticModel
15
14
  import nltk
16
15
 
17
16
  tokenizer = RegexTokenizer()
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
32
31
  portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
33
32
  langdetect_model = fasttext.load_model(fasttext_model_path)
34
33
 
35
- embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
36
-
37
- _options = ort.SessionOptions()
38
- _options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
39
- _options.register_custom_ops_library(get_library_path())
40
- _providers = ["CPUExecutionProvider"]
41
-
42
- embedding_model = ort.InferenceSession(
43
- path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
44
- sess_options=_options,
45
- providers=_providers
46
- )
34
+ embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
47
35
 
48
36
  hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
49
37
 
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
54
42
  return fixed_size_matrix.tolist()
55
43
 
56
44
  def extract_semantic_embeddings(text):
57
- return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
45
+ return embedding_model.encode([text])[0]
58
46
 
59
47
  def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
60
48
  chunks = []
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
339
327
  haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
340
328
 
341
329
  if embedding_mode in {'textual', 'both'}:
342
- with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
330
+ with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
343
331
  haystack_textual_embeddings = list(
344
332
  executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
345
333
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.7
3
+ Version: 1.9
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -9,12 +9,11 @@ Classifier: Operating System :: OS Independent
9
9
  Requires-Python: >=3.7
10
10
  Description-Content-Type: text/markdown
11
11
  License-File: LICENSE
12
- Requires-Dist: numpy <2
12
+ Requires-Dist: numpy<2
13
13
  Requires-Dist: nltk
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: fasttext
16
- Requires-Dist: onnxruntime
17
- Requires-Dist: onnxruntime-extensions
16
+ Requires-Dist: model2vec
18
17
  Requires-Dist: pyspellchecker
19
18
 
20
19
  ```python
@@ -1,15 +1,14 @@
1
1
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- compressor/semantic.py,sha256=CJ5WhWKDuBT19qB_5EvMqWw5mtU2jCqBmOkVWXODLX0,16257
2
+ compressor/semantic.py,sha256=QrCpAdOPnGuK8V-yjJ9mUZa0olZYj0Ul0wRJgNc5cI8,15738
3
3
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
4
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
5
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
6
6
  compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
7
- compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
8
7
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
9
8
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
10
9
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
11
- semantic_compressor-1.7.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
12
- semantic_compressor-1.7.dist-info/METADATA,sha256=I4nO2VQxeIOJAAzs2DMhxmotVV6IvdVMfeheUwAFCTQ,6178
13
- semantic_compressor-1.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
14
- semantic_compressor-1.7.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
15
- semantic_compressor-1.7.dist-info/RECORD,,
10
+ semantic_compressor-1.9.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
11
+ semantic_compressor-1.9.dist-info/METADATA,sha256=1BP6qPt8T8sR9NrHG-pgS3GJKiIHpOXCWxTHadvu2KA,6137
12
+ semantic_compressor-1.9.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
+ semantic_compressor-1.9.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
14
+ semantic_compressor-1.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
Binary file