semantic_compressor-1.7-py3-none-any.whl → semantic_compressor-1.9-py3-none-any.whl

compressor/semantic.py CHANGED
@@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
 import numpy as np, pickle, fasttext, os, traceback, importlib
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.metrics.pairwise import cosine_similarity
-from onnxruntime_extensions import get_library_path
 from compressor.minbpe.regex import RegexTokenizer
 from concurrent.futures import ProcessPoolExecutor
 from nltk.tokenize import sent_tokenize
@@ -11,7 +10,7 @@ from spellchecker import SpellChecker
 from nltk.stem import PorterStemmer
 from nltk.stem import RSLPStemmer
 from collections import Counter
-import onnxruntime as ort
+from model2vec import StaticModel
 import nltk
 
 tokenizer = RegexTokenizer()
@@ -32,18 +31,7 @@ english_stopwords = pickle.load(open(english_stopwords_path, "rb"))
 portuguese_stopwords = pickle.load(open(portuguese_stopwords_path, "rb"))
 langdetect_model = fasttext.load_model(fasttext_model_path)
 
-embedding_model_cpu_count = os.environ.get('EMBEDDING_MODEL_CPU_COUNT', 1)
-
-_options = ort.SessionOptions()
-_options.inter_op_num_threads, _options.intra_op_num_threads = embedding_model_cpu_count, embedding_model_cpu_count
-_options.register_custom_ops_library(get_library_path())
-_providers = ["CPUExecutionProvider"]
-
-embedding_model = ort.InferenceSession(
-    path_or_bytes = str(importlib.resources.files('compressor').joinpath('resources/embedding_model.onnx')),
-    sess_options=_options,
-    providers=_providers
-)
+embedding_model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")
 
 hashing_vectorizer = HashingVectorizer(ngram_range=(1, 6), analyzer='char', n_features=512)
 
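The bundled 72 MB ONNX session (with thread options and custom ops) collapses into a single model2vec call. A minimal sketch of the new embedding path, assuming network access to the Hugging Face Hub on first load:

```python
# Minimal sketch of the 1.9 embedding path using model2vec.
# StaticModel.from_pretrained downloads and caches the model from the
# Hugging Face Hub, so the wheel no longer needs to bundle ONNX weights.
from model2vec import StaticModel

model = StaticModel.from_pretrained("cnmoro/multilingual-e5-small-distilled-16m")

# encode() takes a list of strings and returns a numpy array of shape
# (n_texts, embedding_dim).
vectors = model.encode(["first sentence", "second sentence"])
print(vectors.shape)
```

Since model2vec inference is essentially static token-embedding lookup and pooling in numpy, the ONNX session options and thread-count tuning disappear along with the runtime dependency.
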
@@ -54,7 +42,7 @@ def extract_textual_embeddings(text):
     return fixed_size_matrix.tolist()
 
 def extract_semantic_embeddings(text):
-    return embedding_model.run(output_names=["outputs"], input_feed={"inputs": [text]})[0][0]
+    return embedding_model.encode([text])[0]
 
 def structurize_text(full_text, tokens_per_chunk=300, chunk_overlap=0):
     chunks = []
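The new `extract_semantic_embeddings` still returns a single 1-D vector, matching what the old `run(...)[0][0]` produced, so downstream similarity code is unchanged. A usage sketch (the `similarity` helper is hypothetical, not part of the package):

```python
# Hypothetical helper showing how the 1-D vectors from
# extract_semantic_embeddings feed sklearn's cosine_similarity,
# which expects 2-D arrays (hence the reshape).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def similarity(a_vec, b_vec) -> float:
    a = np.asarray(a_vec).reshape(1, -1)
    b = np.asarray(b_vec).reshape(1, -1)
    return float(cosine_similarity(a, b)[0][0])
```
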
@@ -339,7 +327,7 @@ def find_needle_in_haystack(
         haystack_semantic_embeddings = list(executor.map(extract_semantic_embeddings, blocks))
 
     if embedding_mode in {'textual', 'both'}:
-        with ProcessPoolExecutor(max_workers=cpu_count()//1.5) as executor:
+        with ProcessPoolExecutor(max_workers=int(cpu_count()//1.5)) as executor:
             haystack_textual_embeddings = list(
                 executor.map(preprocess_and_extract_textual_embedding, blocks, [use_stemming]*len(blocks), [lang]*len(blocks))
             )
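The `int(...)` wrapper is a genuine bug fix: floor division by a float yields a float (`8 // 1.5 == 5.0`), and `ProcessPoolExecutor` cannot size its pool with a float `max_workers` (it fails with a `TypeError` once it tries to spawn workers). A quick illustration:

```python
from os import cpu_count

workers = cpu_count() // 1.5        # float, e.g. 5.0 on an 8-core machine
assert isinstance(workers, float)   # floor division by a float stays float

workers = int(cpu_count() // 1.5)   # int, e.g. 5 -- valid for max_workers
assert isinstance(workers, int)
```
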
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: semantic_compressor
3
- Version: 1.7
3
+ Version: 1.9
4
4
  Author: Carlo Moro
5
5
  Author-email: Carlo Moro <cnmoro@gmail.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -9,12 +9,11 @@ Classifier: Operating System :: OS Independent
9
9
  Requires-Python: >=3.7
10
10
  Description-Content-Type: text/markdown
11
11
  License-File: LICENSE
12
- Requires-Dist: numpy <2
12
+ Requires-Dist: numpy<2
13
13
  Requires-Dist: nltk
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: fasttext
16
- Requires-Dist: onnxruntime
17
- Requires-Dist: onnxruntime-extensions
16
+ Requires-Dist: model2vec
18
17
  Requires-Dist: pyspellchecker
19
18
 
20
19
  ```python
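The `numpy <2` → `numpy<2` change is purely cosmetic: both spell the same PEP 508 requirement, and the newer setuptools generator simply serializes it without the space. This can be verified with the `packaging` library (used here for illustration; it is not a dependency of this package):

```python
# Both METADATA spellings parse to the identical PEP 508 requirement.
from packaging.requirements import Requirement

old, new = Requirement("numpy <2"), Requirement("numpy<2")
assert (old.name, old.specifier) == (new.name, new.specifier)
print(old)  # prints "numpy<2": the normalized form used in 1.9
```
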
@@ -1,15 +1,14 @@
1
1
  compressor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- compressor/semantic.py,sha256=CJ5WhWKDuBT19qB_5EvMqWw5mtU2jCqBmOkVWXODLX0,16257
2
+ compressor/semantic.py,sha256=QrCpAdOPnGuK8V-yjJ9mUZa0olZYj0Ul0wRJgNc5cI8,15738
3
3
  compressor/minbpe/__init__.py,sha256=wZ1z2QKkncvGgiZDBc91AP5m7-M-MVenPStKbS6xylE,95
4
4
  compressor/minbpe/base.py,sha256=tTKag04RRFnc4ppoieBbDV0V6thzi_ZvZTlhOYIoY7Q,6881
5
5
  compressor/minbpe/basic.py,sha256=0kD4tU8l2MZegfPaHMfDo5CnaSzb9i1v9tDBy6GwMbg,2883
6
6
  compressor/minbpe/regex.py,sha256=k3bllcxc5c7mi43tUEGg6jX-Zc4Cvfb1CCTGEp7ZcVM,5821
7
- compressor/resources/embedding_model.onnx,sha256=uLBbAfCGEJTwR1yyiK0bMDroruLr6W5lDSEIBdXKcPc,71794489
8
7
  compressor/resources/en_stopwords.pkl,sha256=Q2PyGQnphPUs_jxN9NMSqp2EQjYv4b4oMJY2aMYvbSY,1310
9
8
  compressor/resources/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
10
9
  compressor/resources/pt_stopwords.pkl,sha256=-9bJaxJWjeOFxLHLT9D-rI3XTzGC0iLJfMiwBDnkCYI,1716
11
- semantic_compressor-1.7.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
12
- semantic_compressor-1.7.dist-info/METADATA,sha256=I4nO2VQxeIOJAAzs2DMhxmotVV6IvdVMfeheUwAFCTQ,6178
13
- semantic_compressor-1.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
14
- semantic_compressor-1.7.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
15
- semantic_compressor-1.7.dist-info/RECORD,,
10
+ semantic_compressor-1.9.dist-info/LICENSE,sha256=DFRihXonZ3qVRaTrzuXNaDI_-h2jyT2SqWqjtTDHfqI,1067
11
+ semantic_compressor-1.9.dist-info/METADATA,sha256=1BP6qPt8T8sR9NrHG-pgS3GJKiIHpOXCWxTHadvu2KA,6137
12
+ semantic_compressor-1.9.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
+ semantic_compressor-1.9.dist-info/top_level.txt,sha256=qb2SlKrEmMrQDVrhwxu3Wr7U6JupPXtDGrJpIQr8xSc,11
14
+ semantic_compressor-1.9.dist-info/RECORD,,
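Dropping the `compressor/resources/embedding_model.onnx` entry (about 72 MB per its size field) accounts for nearly all of the wheel's size reduction. Each RECORD line follows the wheel spec's `path,sha256=<digest>,<size>` format, where the digest is the urlsafe-base64 SHA-256 of the file with `=` padding stripped. A sketch that reproduces such a line:

```python
# Recompute a RECORD entry for a file on disk: urlsafe base64 of the
# raw sha256 digest, '=' padding stripped, plus the byte size.
import base64
import hashlib

def record_entry(path: str) -> str:
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"
```
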
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
compressor/resources/embedding_model.onnx DELETED (binary file, no diff shown)