ebm4subjects 0.5.5__tar.gz → 0.5.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/PKG-INFO +1 -1
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/pyproject.toml +1 -1
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/ebm_model.py +7 -8
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/embedding_generator.py +6 -33
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/.gitignore +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/.python-version +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/LICENSE +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/README.md +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/Makefile +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/make.bat +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/source/README.md +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/source/conf.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/source/ebm4subjects.rst +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/docs/source/index.rst +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/ebm-sketch.svg +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/__init__.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/analyzer.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/chunker.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/duckdb_client.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/ebm_logging.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/src/ebm4subjects/prepare_data.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/tests/__init__.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/tests/data/vocab.ttl +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/tests/test_hello.py +0 -0
- {ebm4subjects-0.5.5 → ebm4subjects-0.5.6}/tests/test_prepare_data.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ebm4subjects
|
|
3
|
-
Version: 0.5.5
|
|
3
|
+
Version: 0.5.6
|
|
4
4
|
Summary: Embedding Based Matching for Automated Subject Indexing
|
|
5
5
|
Author: Deutsche Nationalbibliothek
|
|
6
6
|
Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
|
|
@@ -44,7 +44,7 @@ class EbmModel:
|
|
|
44
44
|
use_altLabels: bool = True,
|
|
45
45
|
hnsw_index_params: dict | str | None = None,
|
|
46
46
|
embedding_model_name: str | None = None,
|
|
47
|
-
embedding_model_deployment: str = "
|
|
47
|
+
embedding_model_deployment: str = "mock",
|
|
48
48
|
embedding_model_args: dict | str | None = None,
|
|
49
49
|
encode_args_vocab: dict | str | None = None,
|
|
50
50
|
encode_args_documents: dict | str | None = None,
|
|
@@ -101,7 +101,7 @@ class EbmModel:
|
|
|
101
101
|
|
|
102
102
|
# Parameters for embedding generator
|
|
103
103
|
self.generator = None
|
|
104
|
-
self.embedding_model_deployment = embedding_model_deployment
|
|
104
|
+
self.embedding_model_deployment = embedding_model_deployment.lower()
|
|
105
105
|
self.embedding_model_name = embedding_model_name
|
|
106
106
|
self.embedding_dimensions = int(embedding_dimensions)
|
|
107
107
|
if isinstance(embedding_model_args, str) or not embedding_model_args:
|
|
@@ -182,7 +182,7 @@ class EbmModel:
|
|
|
182
182
|
"""
|
|
183
183
|
if self.generator is None:
|
|
184
184
|
if self.embedding_model_deployment == "in-process":
|
|
185
|
-
self.logger.info("initializing
|
|
185
|
+
self.logger.info("initializing in-process embedding generator")
|
|
186
186
|
self.generator = EmbeddingGeneratorInProcess(
|
|
187
187
|
model_name=self.embedding_model_name,
|
|
188
188
|
embedding_dimensions=self.embedding_dimensions,
|
|
@@ -192,7 +192,7 @@ class EbmModel:
|
|
|
192
192
|
elif self.embedding_model_deployment == "mock":
|
|
193
193
|
self.logger.info("initializing mock embedding generator")
|
|
194
194
|
self.generator = EmbeddingGeneratorMock(self.embedding_dimensions)
|
|
195
|
-
elif self.embedding_model_deployment == "
|
|
195
|
+
elif self.embedding_model_deployment == "huggingfacetei":
|
|
196
196
|
self.logger.info("initializing API embedding generator")
|
|
197
197
|
self.generator = EmbeddingGeneratorHuggingFaceTEI(
|
|
198
198
|
model_name=self.embedding_model_name,
|
|
@@ -200,7 +200,7 @@ class EbmModel:
|
|
|
200
200
|
logger=self.logger,
|
|
201
201
|
**self.embedding_model_args,
|
|
202
202
|
)
|
|
203
|
-
elif self.embedding_model_deployment == "
|
|
203
|
+
elif self.embedding_model_deployment == "openai":
|
|
204
204
|
self.logger.info("initializing API embedding generator")
|
|
205
205
|
self.generator = EmbeddingGeneratorOpenAI(
|
|
206
206
|
model_name=self.embedding_model_name,
|
|
@@ -209,8 +209,7 @@ class EbmModel:
|
|
|
209
209
|
**self.embedding_model_args,
|
|
210
210
|
)
|
|
211
211
|
else:
|
|
212
|
-
|
|
213
|
-
raise NotImplementedError
|
|
212
|
+
raise NotImplementedError("Unsupportet API for embedding generator")
|
|
214
213
|
|
|
215
214
|
def init_logger(
|
|
216
215
|
self,
|
|
@@ -670,7 +669,7 @@ class EbmModel:
|
|
|
670
669
|
)
|
|
671
670
|
self.logger.info("training successful finished")
|
|
672
671
|
except xgb.core.XGBoostError:
|
|
673
|
-
self.logger.
|
|
672
|
+
self.logger.warn(
|
|
674
673
|
"XGBoost can't train with candidates equal to gold standard "
|
|
675
674
|
"or candidates with no match to gold standard at all - "
|
|
676
675
|
"Check if your training data and gold standard are correct"
|
|
@@ -80,17 +80,7 @@ class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGenerator):
|
|
|
80
80
|
self.logger.debug(
|
|
81
81
|
"API call successful. Everything seems to be working fine."
|
|
82
82
|
)
|
|
83
|
-
elif response.status_code == 404:
|
|
84
|
-
self.logger.error(
|
|
85
|
-
"API not found under given adress! Please check the corresponding parameter!"
|
|
86
|
-
)
|
|
87
|
-
raise RuntimeError(
|
|
88
|
-
"API not found under given adress! Please check the corresponding parameter!"
|
|
89
|
-
)
|
|
90
83
|
else:
|
|
91
|
-
self.logger.error(
|
|
92
|
-
"Request to API not possible! Please check the corresponding parameters!"
|
|
93
|
-
)
|
|
94
84
|
raise RuntimeError(
|
|
95
85
|
"Request to API not possible! Please check the corresponding parameters!"
|
|
96
86
|
)
|
|
@@ -188,29 +178,12 @@ class EmbeddingGeneratorOpenAI(EmbeddingGenerator):
|
|
|
188
178
|
"""
|
|
189
179
|
Tests if the API is working with the given parameters
|
|
190
180
|
"""
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
self.logger.debug(
|
|
198
|
-
"API call successful. Everything seems to be working fine."
|
|
199
|
-
)
|
|
200
|
-
except NotFoundError:
|
|
201
|
-
self.logger.error(
|
|
202
|
-
"API not found under given adress! Please check the corresponding parameter!"
|
|
203
|
-
)
|
|
204
|
-
raise RuntimeError(
|
|
205
|
-
"API not found under given adress! Please check the corresponding parameter!"
|
|
206
|
-
)
|
|
207
|
-
except BadRequestError:
|
|
208
|
-
self.logger.error(
|
|
209
|
-
"Request to API not possible! Please check the corresponding parameters!"
|
|
210
|
-
)
|
|
211
|
-
raise RuntimeError(
|
|
212
|
-
"Request to API not possible! Please check the corresponding parameters!"
|
|
213
|
-
)
|
|
181
|
+
_ = self.client.embeddings.create(
|
|
182
|
+
input="This is a test request!",
|
|
183
|
+
model=self.model_name,
|
|
184
|
+
encoding_format="float",
|
|
185
|
+
)
|
|
186
|
+
self.logger.debug("API call successful. Everything seems to be working fine.")
|
|
214
187
|
|
|
215
188
|
def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
|
|
216
189
|
"""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|