ebm4subjects 0.5.2__tar.gz → 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/PKG-INFO +1 -1
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/pyproject.toml +1 -1
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/ebm_model.py +18 -9
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/embedding_generator.py +76 -9
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/tests/test_prepare_data.py +3 -1
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/.gitignore +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/.python-version +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/LICENSE +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/README.md +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/Makefile +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/make.bat +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/source/README.md +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/source/conf.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/source/ebm4subjects.rst +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/docs/source/index.rst +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/ebm-sketch.svg +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/__init__.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/analyzer.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/chunker.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/duckdb_client.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/ebm_logging.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/src/ebm4subjects/prepare_data.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/tests/__init__.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/tests/data/vocab.ttl +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.4}/tests/test_hello.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ebm4subjects
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: Embedding Based Matching for Automated Subject Indexing
|
|
5
5
|
Author: Deutsche Nationalbibliothek
|
|
6
6
|
Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
|
|
@@ -15,8 +15,9 @@ from ebm4subjects.duckdb_client import Duckdb_client
|
|
|
15
15
|
from ebm4subjects.ebm_logging import EbmLogger, NullLogger, XGBLogging
|
|
16
16
|
from ebm4subjects.embedding_generator import (
|
|
17
17
|
EmbeddingGeneratorHuggingFaceTEI,
|
|
18
|
-
EmbeddingGeneratorInternal,
|
|
19
18
|
EmbeddingGeneratorMock,
|
|
19
|
+
EmbeddingGeneratorOfflineInference,
|
|
20
|
+
EmbeddingGeneratorOpenAI,
|
|
20
21
|
)
|
|
21
22
|
|
|
22
23
|
|
|
@@ -43,7 +44,7 @@ class EbmModel:
|
|
|
43
44
|
use_altLabels: bool = True,
|
|
44
45
|
hnsw_index_params: dict | str | None = None,
|
|
45
46
|
embedding_model_name: str | None = None,
|
|
46
|
-
|
|
47
|
+
embedding_model_deployment: str = "offline-inference",
|
|
47
48
|
embedding_model_args: dict | str | None = None,
|
|
48
49
|
encode_args_vocab: dict | str | None = None,
|
|
49
50
|
encode_args_documents: dict | str | None = None,
|
|
@@ -99,7 +100,7 @@ class EbmModel:
|
|
|
99
100
|
|
|
100
101
|
# Parameters for embedding generator
|
|
101
102
|
self.generator = None
|
|
102
|
-
self.
|
|
103
|
+
self.embedding_model_deployment = embedding_model_deployment
|
|
103
104
|
self.embedding_model_name = embedding_model_name
|
|
104
105
|
self.embedding_dimensions = int(embedding_dimensions)
|
|
105
106
|
if isinstance(embedding_model_args, str) or not embedding_model_args:
|
|
@@ -179,19 +180,27 @@ class EbmModel:
|
|
|
179
180
|
None
|
|
180
181
|
"""
|
|
181
182
|
if self.generator is None:
|
|
182
|
-
if self.
|
|
183
|
-
self.logger.info("initializing
|
|
184
|
-
self.generator =
|
|
183
|
+
if self.embedding_model_deployment == "offline-inference":
|
|
184
|
+
self.logger.info("initializing offline-inference embedding generator")
|
|
185
|
+
self.generator = EmbeddingGeneratorOfflineInference(
|
|
185
186
|
model_name=self.embedding_model_name,
|
|
186
187
|
embedding_dimensions=self.embedding_dimensions,
|
|
187
188
|
**self.embedding_model_args,
|
|
188
189
|
)
|
|
189
|
-
elif self.
|
|
190
|
+
elif self.embedding_model_deployment == "mock":
|
|
190
191
|
self.logger.info("initializing mock embedding generator")
|
|
191
192
|
self.generator = EmbeddingGeneratorMock(self.embedding_dimensions)
|
|
192
|
-
elif self.
|
|
193
|
+
elif self.embedding_model_deployment == "HuggingFaceTEI":
|
|
193
194
|
self.logger.info("initializing API embedding generator")
|
|
194
195
|
self.generator = EmbeddingGeneratorHuggingFaceTEI(
|
|
196
|
+
model_name=self.embedding_model_name,
|
|
197
|
+
embedding_dimensions=self.embedding_dimensions,
|
|
198
|
+
**self.embedding_model_args,
|
|
199
|
+
)
|
|
200
|
+
elif self.embedding_model_deployment == "OpenAI":
|
|
201
|
+
self.logger.info("initializing API embedding generator")
|
|
202
|
+
self.generator = EmbeddingGeneratorOpenAI(
|
|
203
|
+
model_name=self.embedding_model_name,
|
|
195
204
|
embedding_dimensions=self.embedding_dimensions,
|
|
196
205
|
**self.embedding_model_args,
|
|
197
206
|
)
|
|
@@ -404,7 +413,7 @@ class EbmModel:
|
|
|
404
413
|
.join(
|
|
405
414
|
other=gold_standard.with_columns(pl.lit(True).alias("gold")),
|
|
406
415
|
on=["doc_id", "label_id"],
|
|
407
|
-
how="
|
|
416
|
+
how="full",
|
|
408
417
|
)
|
|
409
418
|
# Fill dataframe so that all not suggested labels which are not part of
|
|
410
419
|
# the gold standard and all gold standard labels which were not
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import requests
|
|
5
5
|
from sentence_transformers import SentenceTransformer
|
|
6
|
+
from tqdm import tqdm
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class EmbeddingGenerator:
|
|
@@ -41,6 +42,7 @@ class EmbeddingGeneratorAPI(EmbeddingGenerator):
|
|
|
41
42
|
|
|
42
43
|
def __init__(
|
|
43
44
|
self,
|
|
45
|
+
model_name: str,
|
|
44
46
|
embedding_dimensions: int,
|
|
45
47
|
**kwargs,
|
|
46
48
|
) -> None:
|
|
@@ -52,7 +54,7 @@ class EmbeddingGeneratorAPI(EmbeddingGenerator):
|
|
|
52
54
|
"""
|
|
53
55
|
|
|
54
56
|
self.embedding_dimensions = embedding_dimensions
|
|
55
|
-
|
|
57
|
+
self.model_name = model_name
|
|
56
58
|
self.session = requests.Session()
|
|
57
59
|
self.api_address = kwargs.get("api_address")
|
|
58
60
|
self.headers = kwargs.get("headers", {"Content-Type": "application/json"})
|
|
@@ -85,26 +87,91 @@ class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGeneratorAPI):
|
|
|
85
87
|
# If empty, return an empty numpy array with the correct shape
|
|
86
88
|
return np.empty((0, self.embedding_dimensions))
|
|
87
89
|
|
|
88
|
-
#
|
|
89
|
-
|
|
90
|
+
# Process in smaller batches to avoid memory overload
|
|
91
|
+
batch_size = min(32, len(texts)) # HuggingFaceTEI has a limit of 32 as default
|
|
92
|
+
|
|
93
|
+
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
|
|
94
|
+
batch_texts = texts[i : i + batch_size]
|
|
90
95
|
# send a request to the HuggingFaceTEI API
|
|
91
|
-
data = {"inputs":
|
|
96
|
+
data = {"inputs": batch_texts, "truncate": True}
|
|
92
97
|
response = self.session.post(
|
|
93
98
|
self.api_address, headers=self.headers, json=data
|
|
94
99
|
)
|
|
95
100
|
|
|
96
101
|
# add generated embeddings to return list if request was successful
|
|
97
102
|
if response.status_code == 200:
|
|
98
|
-
embeddings.
|
|
103
|
+
embeddings.extend(response.json())
|
|
104
|
+
else:
|
|
105
|
+
# TODO: write warning to logger
|
|
106
|
+
for _ in batch_texts:
|
|
107
|
+
# TODO: ensure same format as true case and truncate dim
|
|
108
|
+
embeddings.append([0 for _ in range(self.embedding_dimensions)])
|
|
109
|
+
|
|
110
|
+
return np.array(embeddings)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class EmbeddingGeneratorOpenAI(EmbeddingGeneratorAPI):
|
|
114
|
+
"""
|
|
115
|
+
A class for generating embeddings using any OpenAI-compatible API.
|
|
116
|
+
"""
|
|
117
|
+
|
|
118
|
+
def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
|
|
119
|
+
"""
|
|
120
|
+
Generates embeddings for a list of input texts using a model
|
|
121
|
+
via an OpenAI compatible API.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
texts (list[str]): A list of input texts.
|
|
125
|
+
**kwargs: Additional keyword arguments merged into the
|
|
126
|
+
JSON payload of each embeddings request.
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
|
|
130
|
+
containing the generated embeddings.
|
|
131
|
+
"""
|
|
132
|
+
# prepare list for return
|
|
133
|
+
embeddings = []
|
|
134
|
+
|
|
135
|
+
# Check if the input list is empty
|
|
136
|
+
if not texts:
|
|
137
|
+
# If empty, return an empty numpy array with the correct shape
|
|
138
|
+
return np.empty((0, self.embedding_dimensions))
|
|
139
|
+
|
|
140
|
+
# Process in smaller batches to avoid memory overload
|
|
141
|
+
batch_size = min(200, len(texts))
|
|
142
|
+
embeddings = []
|
|
143
|
+
|
|
144
|
+
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
|
|
145
|
+
batch_texts = texts[i : i + batch_size]
|
|
146
|
+
data = {
|
|
147
|
+
"input": batch_texts,
|
|
148
|
+
"model": self.model_name,
|
|
149
|
+
"encoding_format": "float",
|
|
150
|
+
**kwargs,
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
response = self.session.post(
|
|
154
|
+
self.api_address, headers=self.headers, json=data
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Process all embeddings from the batch response
|
|
158
|
+
if response.status_code == 200:
|
|
159
|
+
response_data = response.json()
|
|
160
|
+
for i, _ in enumerate(batch_texts):
|
|
161
|
+
embedding = response_data["data"][i]["embedding"]
|
|
162
|
+
embeddings.append(embedding)
|
|
99
163
|
else:
|
|
100
|
-
|
|
164
|
+
# TODO: write warning to logger
|
|
165
|
+
for _ in batch_texts:
|
|
166
|
+
embeddings.append([0 for _ in range(self.embedding_dimensions)])
|
|
101
167
|
|
|
102
168
|
return np.array(embeddings)
|
|
103
169
|
|
|
104
170
|
|
|
105
|
-
class
|
|
171
|
+
class EmbeddingGeneratorOfflineInference(EmbeddingGenerator):
|
|
106
172
|
"""
|
|
107
|
-
A class for generating embeddings using a given SentenceTransformer model
|
|
173
|
+
A class for generating embeddings using a given SentenceTransformer model
|
|
174
|
+
loaded offline with SentenceTransformer.
|
|
108
175
|
|
|
109
176
|
Args:
|
|
110
177
|
model_name (str): The name of the SentenceTransformer model.
|
|
@@ -118,7 +185,7 @@ class EmbeddingGeneratorInternal(EmbeddingGenerator):
|
|
|
118
185
|
|
|
119
186
|
def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
|
|
120
187
|
"""
|
|
121
|
-
Initializes the
|
|
188
|
+
Initializes the EmbeddingGenerator in offline inference mode.
|
|
122
189
|
|
|
123
190
|
Sets the model name, embedding dimensions, and creates a
|
|
124
191
|
SentenceTransformer model instance.
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import polars as pl
|
|
2
1
|
from pathlib import Path
|
|
3
2
|
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
4
5
|
from ebm4subjects.prepare_data import parse_vocab
|
|
5
6
|
|
|
7
|
+
|
|
6
8
|
def test_parse_vocab_reads_ttl_and_returns_dataframe(tmp_path):
|
|
7
9
|
# Copy the sample vocab.ttl to a temp location
|
|
8
10
|
vocab_src = Path(__file__).parent / "data/vocab.ttl"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|