ebm4subjects 0.5.2__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/PKG-INFO +1 -1
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/pyproject.toml +1 -1
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_model.py +6 -6
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/embedding_generator.py +4 -3
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/.gitignore +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/.python-version +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/LICENSE +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/README.md +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/Makefile +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/make.bat +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/README.md +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/conf.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/ebm4subjects.rst +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/index.rst +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/ebm-sketch.svg +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/__init__.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/analyzer.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/chunker.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/duckdb_client.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_logging.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/prepare_data.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/__init__.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/data/vocab.ttl +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/test_hello.py +0 -0
- {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/test_prepare_data.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ebm4subjects
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Embedding Based Matching for Automated Subject Indexing
|
|
5
5
|
Author: Deutsche Nationalbibliothek
|
|
6
6
|
Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
|
|
@@ -15,7 +15,7 @@ from ebm4subjects.duckdb_client import Duckdb_client
|
|
|
15
15
|
from ebm4subjects.ebm_logging import EbmLogger, NullLogger, XGBLogging
|
|
16
16
|
from ebm4subjects.embedding_generator import (
|
|
17
17
|
EmbeddingGeneratorHuggingFaceTEI,
|
|
18
|
-
|
|
18
|
+
EmbeddingGeneratorOfflineInference,
|
|
19
19
|
EmbeddingGeneratorMock,
|
|
20
20
|
)
|
|
21
21
|
|
|
@@ -43,7 +43,7 @@ class EbmModel:
|
|
|
43
43
|
use_altLabels: bool = True,
|
|
44
44
|
hnsw_index_params: dict | str | None = None,
|
|
45
45
|
embedding_model_name: str | None = None,
|
|
46
|
-
embedding_model_type: str = "
|
|
46
|
+
embedding_model_type: str = "offline-inference",
|
|
47
47
|
embedding_model_args: dict | str | None = None,
|
|
48
48
|
encode_args_vocab: dict | str | None = None,
|
|
49
49
|
encode_args_documents: dict | str | None = None,
|
|
@@ -179,9 +179,9 @@ class EbmModel:
|
|
|
179
179
|
None
|
|
180
180
|
"""
|
|
181
181
|
if self.generator is None:
|
|
182
|
-
if self.embedding_model_type == "
|
|
183
|
-
self.logger.info("initializing
|
|
184
|
-
self.generator = EmbeddingGeneratorInternal(
|
|
182
|
+
if self.embedding_model_type == "offline-inference":
|
|
183
|
+
self.logger.info("initializing offline-inference embedding generator")
|
|
184
|
+
self.generator = EmbeddingGeneratorOfflineInference(
|
|
185
185
|
model_name=self.embedding_model_name,
|
|
186
186
|
embedding_dimensions=self.embedding_dimensions,
|
|
187
187
|
**self.embedding_model_args,
|
|
@@ -404,7 +404,7 @@ class EbmModel:
|
|
|
404
404
|
.join(
|
|
405
405
|
other=gold_standard.with_columns(pl.lit(True).alias("gold")),
|
|
406
406
|
on=["doc_id", "label_id"],
|
|
407
|
-
how="
|
|
407
|
+
how="full",
|
|
408
408
|
)
|
|
409
409
|
# Fill dataframe so that all not suggested labels which are not part of
|
|
410
410
|
# the gold standard and all gold standard labels which where not
|
|
@@ -102,9 +102,10 @@ class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGeneratorAPI):
|
|
|
102
102
|
return np.array(embeddings)
|
|
103
103
|
|
|
104
104
|
|
|
105
|
-
class EmbeddingGeneratorInternal(EmbeddingGenerator):
|
|
105
|
+
class EmbeddingGeneratorOfflineInference(EmbeddingGenerator):
|
|
106
106
|
"""
|
|
107
|
-
A class for generating embeddings using a given SentenceTransformer model
|
|
107
|
+
A class for generating embeddings using a given SentenceTransformer model
|
|
108
|
+
loaded offline with SentenceTransformer.
|
|
108
109
|
|
|
109
110
|
Args:
|
|
110
111
|
model_name (str): The name of the SentenceTransformer model.
|
|
@@ -118,7 +119,7 @@ class EmbeddingGeneratorInternal(EmbeddingGenerator):
|
|
|
118
119
|
|
|
119
120
|
def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
|
|
120
121
|
"""
|
|
121
|
-
Initializes the
|
|
122
|
+
Initializes the EmbeddingGenerator in offline inference mode.
|
|
122
123
|
|
|
123
124
|
Sets the model name, embedding dimensions, and creates a
|
|
124
125
|
SentenceTransformer model instance.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|