ebm4subjects 0.5.1__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/PKG-INFO +1 -1
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/pyproject.toml +1 -1
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/ebm_model.py +33 -12
- ebm4subjects-0.5.2/src/ebm4subjects/embedding_generator.py +201 -0
- ebm4subjects-0.5.1/src/ebm4subjects/embedding_generator.py +0 -70
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/.gitignore +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/.python-version +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/LICENSE +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/README.md +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/Makefile +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/make.bat +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/source/README.md +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/source/conf.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/source/ebm4subjects.rst +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/docs/source/index.rst +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/ebm-sketch.svg +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/__init__.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/analyzer.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/chunker.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/duckdb_client.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/ebm_logging.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/src/ebm4subjects/prepare_data.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/tests/__init__.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/tests/data/vocab.ttl +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/tests/test_hello.py +0 -0
- {ebm4subjects-0.5.1 → ebm4subjects-0.5.2}/tests/test_prepare_data.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ebm4subjects
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: Embedding Based Matching for Automated Subject Indexing
|
|
5
5
|
Author: Deutsche Nationalbibliothek
|
|
6
6
|
Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
|
|
@@ -13,13 +13,16 @@ from ebm4subjects import prepare_data
|
|
|
13
13
|
from ebm4subjects.chunker import Chunker
|
|
14
14
|
from ebm4subjects.duckdb_client import Duckdb_client
|
|
15
15
|
from ebm4subjects.ebm_logging import EbmLogger, NullLogger, XGBLogging
|
|
16
|
-
from ebm4subjects.embedding_generator import EmbeddingGenerator
|
|
16
|
+
from ebm4subjects.embedding_generator import (
|
|
17
|
+
EmbeddingGeneratorHuggingFaceTEI,
|
|
18
|
+
EmbeddingGeneratorInternal,
|
|
19
|
+
EmbeddingGeneratorMock,
|
|
20
|
+
)
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class EbmModel:
|
|
20
24
|
def __init__(
|
|
21
25
|
self,
|
|
22
|
-
embedding_model_name: str | Any,
|
|
23
26
|
embedding_dimensions: int | str,
|
|
24
27
|
chunk_tokenizer: str | Any,
|
|
25
28
|
max_chunk_count: int | str,
|
|
@@ -39,7 +42,9 @@ class EbmModel:
|
|
|
39
42
|
collection_name: str = "my_collection",
|
|
40
43
|
use_altLabels: bool = True,
|
|
41
44
|
hnsw_index_params: dict | str | None = None,
|
|
42
|
-
|
|
45
|
+
embedding_model_name: str | None = None,
|
|
46
|
+
embedding_model_type: str = "internal",
|
|
47
|
+
embedding_model_args: dict | str | None = None,
|
|
43
48
|
encode_args_vocab: dict | str | None = None,
|
|
44
49
|
encode_args_documents: dict | str | None = None,
|
|
45
50
|
log_path: str | None = None,
|
|
@@ -94,11 +99,14 @@ class EbmModel:
|
|
|
94
99
|
|
|
95
100
|
# Parameters for embedding generator
|
|
96
101
|
self.generator = None
|
|
102
|
+
self.embedding_model_type = embedding_model_type
|
|
97
103
|
self.embedding_model_name = embedding_model_name
|
|
98
104
|
self.embedding_dimensions = int(embedding_dimensions)
|
|
99
|
-
if isinstance(
|
|
100
|
-
|
|
101
|
-
|
|
105
|
+
if isinstance(embedding_model_args, str) or not embedding_model_args:
|
|
106
|
+
embedding_model_args = (
|
|
107
|
+
ast.literal_eval(embedding_model_args) if embedding_model_args else {}
|
|
108
|
+
)
|
|
109
|
+
self.embedding_model_args = embedding_model_args
|
|
102
110
|
if isinstance(encode_args_vocab, str) or not encode_args_vocab:
|
|
103
111
|
encode_args_vocab = (
|
|
104
112
|
ast.literal_eval(encode_args_vocab) if encode_args_vocab else {}
|
|
@@ -171,12 +179,25 @@ class EbmModel:
|
|
|
171
179
|
None
|
|
172
180
|
"""
|
|
173
181
|
if self.generator is None:
|
|
174
|
-
self.
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
182
|
+
if self.embedding_model_type == "internal":
|
|
183
|
+
self.logger.info("initializing internal embedding generator")
|
|
184
|
+
self.generator = EmbeddingGeneratorInternal(
|
|
185
|
+
model_name=self.embedding_model_name,
|
|
186
|
+
embedding_dimensions=self.embedding_dimensions,
|
|
187
|
+
**self.embedding_model_args,
|
|
188
|
+
)
|
|
189
|
+
elif self.embedding_model_type == "mock":
|
|
190
|
+
self.logger.info("initializing mock embedding generator")
|
|
191
|
+
self.generator = EmbeddingGeneratorMock(self.embedding_dimensions)
|
|
192
|
+
elif self.embedding_model_type == "HuggingFaceTEI":
|
|
193
|
+
self.logger.info("initializing API embedding generator")
|
|
194
|
+
self.generator = EmbeddingGeneratorHuggingFaceTEI(
|
|
195
|
+
embedding_dimensions=self.embedding_dimensions,
|
|
196
|
+
**self.embedding_model_args,
|
|
197
|
+
)
|
|
198
|
+
else:
|
|
199
|
+
self.logger.error("unsupportet API for embedding generator")
|
|
200
|
+
raise NotImplementedError
|
|
180
201
|
|
|
181
202
|
def init_logger(
|
|
182
203
|
self, log_path: str | None = None, logger: logging.Logger | None = None
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import requests
|
|
5
|
+
from sentence_transformers import SentenceTransformer
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EmbeddingGenerator:
    """
    Common parent of all embedding generators.

    The class itself carries no state and produces no embeddings; concrete
    generators derive from it and override ``generate_embeddings``.
    """

    def __init__(self) -> None:
        """
        Base method for the initialization of an EmbeddingGenerator.

        Does nothing in the base class.
        """
        return None

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Base method for creating embeddings with an EmbeddingGenerator.

        The base implementation is a placeholder: it ignores its arguments
        and returns ``None``. Subclasses override it.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: In subclasses, a numpy array of shape
                (len(texts), embedding_dimensions) containing the
                generated embeddings.
        """
        return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class EmbeddingGeneratorAPI(EmbeddingGenerator):
    """
    Shared base for embedding generators that talk to a remote API.

    Attributes:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        session (requests.Session): Session object created at construction.
        api_address: Value of the ``api_address`` keyword argument, or
            ``None`` when it was not supplied.
        headers: Value of the ``headers`` keyword argument, or a JSON
            content-type header when it was not supplied.
    """

    def __init__(self, embedding_dimensions: int, **kwargs) -> None:
        """
        Initializes the API EmbeddingGenerator.

        Stores the embedding dimensions, opens a ``requests`` session and
        reads the connection settings from the keyword arguments.

        Args:
            embedding_dimensions (int): The dimensionality of the embeddings.
            **kwargs: May contain ``api_address`` and ``headers``; any other
                keys are ignored.
        """
        self.embedding_dimensions = embedding_dimensions
        self.session = requests.Session()

        # No address is enforced here; a missing one is stored as None.
        self.api_address = kwargs.get("api_address")

        if "headers" in kwargs:
            self.headers = kwargs["headers"]
        else:
            self.headers = {"Content-Type": "application/json"}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGeneratorAPI):
    """
    A class for generating embeddings using the HuggingFaceTEI API.
    """

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates embeddings for a list of input texts using a model
        via the HuggingFaceTEI API.

        One POST request is sent per text to ``self.api_address`` with the
        JSON payload ``{"inputs": text}``. On HTTP 200 the first element of
        the JSON response is used as the embedding. On any other status code
        a zero vector of length ``embedding_dimensions`` is inserted, so the
        output rows stay aligned with ``texts``.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Accepted for interface compatibility with the other
                generators; not used by this implementation.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        # Check if the input list is empty
        if not texts:
            # If empty, return an empty numpy array with the correct shape
            return np.empty((0, self.embedding_dimensions))

        # Float zeros (not int zeros) keep the dtype of the result floating
        # point even when every single request fails.
        fallback = [0.0] * self.embedding_dimensions

        embeddings = []
        for text in texts:
            # send a request to the HuggingFaceTEI API
            response = self.session.post(
                self.api_address, headers=self.headers, json={"inputs": text}
            )

            # keep the embedding if the request was successful,
            # otherwise substitute the zero vector
            if response.status_code == 200:
                embeddings.append(response.json()[0])
            else:
                embeddings.append(fallback)

        return np.array(embeddings)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class EmbeddingGeneratorInternal(EmbeddingGenerator):
    """
    Embedding generator backed by a locally loaded SentenceTransformer model.

    Args:
        model_name (str): The name of the SentenceTransformer model.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        **kwargs: Additional keyword arguments to pass to the model.

    Attributes:
        model_name (str): The name of the SentenceTransformer model.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        model (SentenceTransformer): The SentenceTransformer model instance.
    """

    def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
        """
        Initializes the internal EmbeddingGenerator.

        Stores the model name and embedding dimensions, then builds the
        SentenceTransformer instance; ``embedding_dimensions`` is passed to
        it as ``truncate_dim``.
        """
        self.model_name = model_name
        self.embedding_dimensions = embedding_dimensions

        # Build the model from its name, forwarding any extra
        # constructor arguments unchanged
        self.model = SentenceTransformer(
            model_name,
            truncate_dim=embedding_dimensions,
            **kwargs,
        )

        # Disable parallelism for the tokenizer: the surrounding process
        # might already be parallelized before embedding creation
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates embeddings for a list of input texts using the
        SentenceTransformer model.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments to pass to the
                model's ``encode`` call.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        if texts:
            # Delegate the actual encoding to the SentenceTransformer model
            return self.model.encode(texts, **kwargs)

        # Nothing to encode: return an empty array with the correct width
        return np.empty((0, self.embedding_dimensions))
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class EmbeddingGeneratorMock(EmbeddingGenerator):
    """
    A mock class for generating fake embeddings. Used for testing.

    Args:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        **kwargs: Additional keyword arguments; accepted and ignored.

    Attributes:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
    """

    def __init__(self, embedding_dimensions: int, **kwargs) -> None:
        """
        Initializes the mock EmbeddingGenerator.

        Sets the embedding dimensions.
        """
        self.embedding_dimensions = embedding_dimensions

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates mock embeddings for a list of input texts.

        Every text is mapped to a vector of ones.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments; accepted and ignored.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        # Check if the input list is empty
        if not texts:
            # If empty, return an empty numpy array with the correct shape
            return np.empty((0, self.embedding_dimensions))

        # Use the configured width instead of a fixed 1024 so the mock
        # matches the documented shape and the empty-input branch above.
        return np.ones((len(texts), self.embedding_dimensions))
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import numpy as np
|
|
4
|
-
from sentence_transformers import SentenceTransformer
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class EmbeddingGenerator:
    """
    A class for generating embeddings using a given SentenceTransformer model.

    Args:
        model_name (str, SentenceTransformer): The name of the SentenceTransformer
            model or a SentenceTransformer model to use.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        **kwargs: Additional keyword arguments to pass to the model.

    Attributes:
        model_name (str): The name of the SentenceTransformer model.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        model (SentenceTransformer): The SentenceTransformer model instance.
    """

    def __init__(
        self, model_name: str | SentenceTransformer, embedding_dimensions: int, **kwargs
    ) -> None:
        """
        Initializes the EmbeddingGenerator.

        Sets the model name, embedding dimensions, and creates a
        SentenceTransformer model instance.
        """
        self.model_name = model_name
        self.embedding_dimensions = embedding_dimensions

        # Create a SentenceTransformer model instance with the given
        # model name and embedding dimensions,
        # or set model to the given SentenceTransformer.
        # isinstance (rather than an exact type comparison) also treats
        # str subclasses as model names.
        if isinstance(model_name, str):
            self.model = SentenceTransformer(
                model_name, truncate_dim=embedding_dimensions, **kwargs
            )
        else:
            # An already constructed model is used as-is; embedding_dimensions
            # and kwargs are not applied to it here.
            self.model = model_name

        # Disable parallelism for tokenizer
        # Needed because process might be already parallelized
        # before embedding creation
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates embeddings for a list of input texts using the
        SentenceTransformer model.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments to pass to the
                SentenceTransformer model.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        # Check if the input list is empty
        if not texts:
            # If empty, return an empty numpy array with the correct shape
            return np.empty((0, self.embedding_dimensions))

        # Generate embeddings using the SentenceTransformer model and return them
        return self.model.encode(texts, **kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|