ebm4subjects 0.5.2__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/PKG-INFO +1 -1
  2. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/pyproject.toml +1 -1
  3. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_model.py +6 -6
  4. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/embedding_generator.py +4 -3
  5. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/.gitignore +0 -0
  6. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/.python-version +0 -0
  7. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/LICENSE +0 -0
  8. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/README.md +0 -0
  9. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/Makefile +0 -0
  10. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/make.bat +0 -0
  11. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/README.md +0 -0
  12. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/conf.py +0 -0
  13. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/ebm4subjects.rst +0 -0
  14. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/docs/source/index.rst +0 -0
  15. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/ebm-sketch.svg +0 -0
  16. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/__init__.py +0 -0
  17. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/analyzer.py +0 -0
  18. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/chunker.py +0 -0
  19. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/duckdb_client.py +0 -0
  20. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_logging.py +0 -0
  21. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/src/ebm4subjects/prepare_data.py +0 -0
  22. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/__init__.py +0 -0
  23. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/data/vocab.ttl +0 -0
  24. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/test_hello.py +0 -0
  25. {ebm4subjects-0.5.2 → ebm4subjects-0.5.3}/tests/test_prepare_data.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ebm4subjects
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: Embedding Based Matching for Automated Subject Indexing
5
5
  Author: Deutsche Nationalbibliothek
6
6
  Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ebm4subjects"
3
- version = "0.5.2"
3
+ version = "0.5.3"
4
4
  description = "Embedding Based Matching for Automated Subject Indexing"
5
5
  authors = [
6
6
  {name = "Deutsche Nationalbibliothek"},
@@ -15,7 +15,7 @@ from ebm4subjects.duckdb_client import Duckdb_client
15
15
  from ebm4subjects.ebm_logging import EbmLogger, NullLogger, XGBLogging
16
16
  from ebm4subjects.embedding_generator import (
17
17
  EmbeddingGeneratorHuggingFaceTEI,
18
- EmbeddingGeneratorInternal,
18
+ EmbeddingGeneratorOfflineInference,
19
19
  EmbeddingGeneratorMock,
20
20
  )
21
21
 
@@ -43,7 +43,7 @@ class EbmModel:
43
43
  use_altLabels: bool = True,
44
44
  hnsw_index_params: dict | str | None = None,
45
45
  embedding_model_name: str | None = None,
46
- embedding_model_type: str = "internal",
46
+ embedding_model_type: str = "offline-inference",
47
47
  embedding_model_args: dict | str | None = None,
48
48
  encode_args_vocab: dict | str | None = None,
49
49
  encode_args_documents: dict | str | None = None,
@@ -179,9 +179,9 @@ class EbmModel:
179
179
  None
180
180
  """
181
181
  if self.generator is None:
182
- if self.embedding_model_type == "internal":
183
- self.logger.info("initializing internal embedding generator")
184
- self.generator = EmbeddingGeneratorInternal(
182
+ if self.embedding_model_type == "offline-inference":
183
+ self.logger.info("initializing offline-inference embedding generator")
184
+ self.generator = EmbeddingGeneratorOfflineInference(
185
185
  model_name=self.embedding_model_name,
186
186
  embedding_dimensions=self.embedding_dimensions,
187
187
  **self.embedding_model_args,
@@ -404,7 +404,7 @@ class EbmModel:
404
404
  .join(
405
405
  other=gold_standard.with_columns(pl.lit(True).alias("gold")),
406
406
  on=["doc_id", "label_id"],
407
- how="outer",
407
+ how="full",
408
408
  )
409
409
  # Fill dataframe so that all not suggested labels which are not part of
410
410
  # the gold standard and all gold standard labels which were not
@@ -102,9 +102,10 @@ class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGeneratorAPI):
102
102
  return np.array(embeddings)
103
103
 
104
104
 
105
- class EmbeddingGeneratorInternal(EmbeddingGenerator):
105
+ class EmbeddingGeneratorOfflineInference(EmbeddingGenerator):
106
106
  """
107
- A class for generating embeddings using a given SentenceTransformer model.
107
+ A class for generating embeddings using a given SentenceTransformer model
108
+ loaded offline with SentenceTransformer.
108
109
 
109
110
  Args:
110
111
  model_name (str): The name of the SentenceTransformer model.
@@ -118,7 +119,7 @@ class EmbeddingGeneratorInternal(EmbeddingGenerator):
118
119
 
119
120
  def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
120
121
  """
121
- Initializes the internal EmbeddingGenerator.
122
+ Initializes the EmbeddingGenerator in offline inference mode.
122
123
 
123
124
  Sets the model name, embedding dimensions, and creates a
124
125
  SentenceTransformer model instance.
File without changes
File without changes
File without changes
File without changes
File without changes