ebm4subjects 0.5.1__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/PKG-INFO +1 -1
  2. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/pyproject.toml +1 -1
  3. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_model.py +34 -13
  4. ebm4subjects-0.5.3/src/ebm4subjects/embedding_generator.py +202 -0
  5. ebm4subjects-0.5.1/src/ebm4subjects/embedding_generator.py +0 -70
  6. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/.gitignore +0 -0
  7. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/.python-version +0 -0
  8. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/LICENSE +0 -0
  9. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/README.md +0 -0
  10. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/Makefile +0 -0
  11. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/make.bat +0 -0
  12. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/source/README.md +0 -0
  13. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/source/conf.py +0 -0
  14. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/source/ebm4subjects.rst +0 -0
  15. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/docs/source/index.rst +0 -0
  16. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/ebm-sketch.svg +0 -0
  17. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/__init__.py +0 -0
  18. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/analyzer.py +0 -0
  19. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/chunker.py +0 -0
  20. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/duckdb_client.py +0 -0
  21. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/ebm_logging.py +0 -0
  22. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/src/ebm4subjects/prepare_data.py +0 -0
  23. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/tests/__init__.py +0 -0
  24. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/tests/data/vocab.ttl +0 -0
  25. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/tests/test_hello.py +0 -0
  26. {ebm4subjects-0.5.1 → ebm4subjects-0.5.3}/tests/test_prepare_data.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ebm4subjects
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Embedding Based Matching for Automated Subject Indexing
5
5
  Author: Deutsche Nationalbibliothek
6
6
  Maintainer-email: Clemens Rietdorf <c.rietdorf@dnb.de>, Maximilian Kähler <m.kaehler@dnb.de>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ebm4subjects"
3
- version = "0.5.1"
3
+ version = "0.5.3"
4
4
  description = "Embedding Based Matching for Automated Subject Indexing"
5
5
  authors = [
6
6
  {name = "Deutsche Nationalbibliothek"},
@@ -13,13 +13,16 @@ from ebm4subjects import prepare_data
13
13
  from ebm4subjects.chunker import Chunker
14
14
  from ebm4subjects.duckdb_client import Duckdb_client
15
15
  from ebm4subjects.ebm_logging import EbmLogger, NullLogger, XGBLogging
16
- from ebm4subjects.embedding_generator import EmbeddingGenerator
16
+ from ebm4subjects.embedding_generator import (
17
+ EmbeddingGeneratorHuggingFaceTEI,
18
+ EmbeddingGeneratorOfflineInference,
19
+ EmbeddingGeneratorMock,
20
+ )
17
21
 
18
22
 
19
23
  class EbmModel:
20
24
  def __init__(
21
25
  self,
22
- embedding_model_name: str | Any,
23
26
  embedding_dimensions: int | str,
24
27
  chunk_tokenizer: str | Any,
25
28
  max_chunk_count: int | str,
@@ -39,7 +42,9 @@ class EbmModel:
39
42
  collection_name: str = "my_collection",
40
43
  use_altLabels: bool = True,
41
44
  hnsw_index_params: dict | str | None = None,
42
- model_args: dict | str | None = None,
45
+ embedding_model_name: str | None = None,
46
+ embedding_model_type: str = "offline-inference",
47
+ embedding_model_args: dict | str | None = None,
43
48
  encode_args_vocab: dict | str | None = None,
44
49
  encode_args_documents: dict | str | None = None,
45
50
  log_path: str | None = None,
@@ -94,11 +99,14 @@ class EbmModel:
94
99
 
95
100
  # Parameters for embedding generator
96
101
  self.generator = None
102
+ self.embedding_model_type = embedding_model_type
97
103
  self.embedding_model_name = embedding_model_name
98
104
  self.embedding_dimensions = int(embedding_dimensions)
99
- if isinstance(model_args, str) or not model_args:
100
- model_args = ast.literal_eval(model_args) if model_args else {}
101
- self.model_args = model_args
105
+ if isinstance(embedding_model_args, str) or not embedding_model_args:
106
+ embedding_model_args = (
107
+ ast.literal_eval(embedding_model_args) if embedding_model_args else {}
108
+ )
109
+ self.embedding_model_args = embedding_model_args
102
110
  if isinstance(encode_args_vocab, str) or not encode_args_vocab:
103
111
  encode_args_vocab = (
104
112
  ast.literal_eval(encode_args_vocab) if encode_args_vocab else {}
@@ -171,12 +179,25 @@ class EbmModel:
171
179
  None
172
180
  """
173
181
  if self.generator is None:
174
- self.logger.info("initializing embedding generator")
175
- self.generator = EmbeddingGenerator(
176
- model_name=self.embedding_model_name,
177
- embedding_dimensions=self.embedding_dimensions,
178
- **self.model_args,
179
- )
182
+ if self.embedding_model_type == "offline-inference":
183
+ self.logger.info("initializing offline-inference embedding generator")
184
+ self.generator = EmbeddingGeneratorOfflineInference(
185
+ model_name=self.embedding_model_name,
186
+ embedding_dimensions=self.embedding_dimensions,
187
+ **self.embedding_model_args,
188
+ )
189
+ elif self.embedding_model_type == "mock":
190
+ self.logger.info("initializing mock embedding generator")
191
+ self.generator = EmbeddingGeneratorMock(self.embedding_dimensions)
192
+ elif self.embedding_model_type == "HuggingFaceTEI":
193
+ self.logger.info("initializing API embedding generator")
194
+ self.generator = EmbeddingGeneratorHuggingFaceTEI(
195
+ embedding_dimensions=self.embedding_dimensions,
196
+ **self.embedding_model_args,
197
+ )
198
+ else:
199
+ self.logger.error("unsupportet API for embedding generator")
200
+ raise NotImplementedError
180
201
 
181
202
  def init_logger(
182
203
  self, log_path: str | None = None, logger: logging.Logger | None = None
@@ -383,7 +404,7 @@ class EbmModel:
383
404
  .join(
384
405
  other=gold_standard.with_columns(pl.lit(True).alias("gold")),
385
406
  on=["doc_id", "label_id"],
386
- how="outer",
407
+ how="full",
387
408
  )
388
409
  # Fill dataframe so that all not suggested labels which are not part of
389
410
  # the gold standard and all gold standard labels which where not
@@ -0,0 +1,202 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ import requests
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+
8
class EmbeddingGenerator:
    """
    A base class for embedding generators.

    The base class holds no state and performs no work itself; concrete
    generators override ``generate_embeddings``.
    """

    def __init__(self) -> None:
        """
        Base method for the initialization of an EmbeddingGenerator.

        Intentionally does nothing; subclasses set their own attributes.
        """

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Base method for creating embeddings with an EmbeddingGenerator.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.

        Raises:
            NotImplementedError: Always. The base class cannot produce
                embeddings; a subclass must override this method.
        """
        # Fail loudly instead of implicitly returning None, which would
        # violate the annotated return type and only surface later as an
        # unrelated error in the caller.
        raise NotImplementedError(
            f"{type(self).__name__} does not implement generate_embeddings"
        )
32
+
33
+
34
class EmbeddingGeneratorAPI(EmbeddingGenerator):
    """
    A base class for API embedding generators.

    Attributes:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        session (requests.Session): The session reused for all API requests.
        api_address (str): The address requests are sent to.
        headers (dict): The HTTP headers sent with every request.
    """

    def __init__(
        self,
        embedding_dimensions: int,
        **kwargs,
    ) -> None:
        """
        Initializes the API EmbeddingGenerator.

        Sets the embedding dimensions, and initializes and
        prepares a session with the API.

        Args:
            embedding_dimensions (int): The dimensionality of the generated
                embeddings.
            **kwargs: Recognized keys are ``api_address`` (required) and
                ``headers`` (optional, defaults to a JSON content type header).

        Raises:
            ValueError: If no ``api_address`` is given.
        """
        super().__init__()

        # Fail fast: without an address every later request would fail with
        # an opaque URL error instead of pointing at the missing setting.
        api_address = kwargs.get("api_address")
        if not api_address:
            raise ValueError(
                "api_address is required for API based embedding generators"
            )

        self.embedding_dimensions = embedding_dimensions

        self.session = requests.Session()
        self.api_address = api_address
        self.headers = kwargs.get("headers", {"Content-Type": "application/json"})
59
+
60
+
61
class EmbeddingGeneratorHuggingFaceTEI(EmbeddingGeneratorAPI):
    """
    A class for generating embeddings using the HuggingFaceTEI API.
    """

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates embeddings for a list of input texts using a model
        via the HuggingFaceTEI API.

        One request is sent per text. A text whose request does not return
        status 200 is represented by a zero vector so that the output stays
        aligned with the input; each such failure is logged as a warning.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments. Only ``timeout`` is used
                (forwarded to the HTTP request, default: no timeout); all
                other keys are ignored.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        # Local import keeps this block self-contained with respect to the
        # module's import section.
        import logging

        # Nothing to embed: return an empty array with the correct shape
        if not texts:
            return np.empty((0, self.embedding_dimensions))

        logger = logging.getLogger(__name__)
        timeout = kwargs.get("timeout")

        embeddings = []
        for text in texts:
            # send a request to the HuggingFaceTEI API
            response = self.session.post(
                self.api_address,
                headers=self.headers,
                json={"inputs": text},
                timeout=timeout,
            )

            if response.status_code == 200:
                # the API answers with a list of embeddings; one text was sent
                embeddings.append(response.json()[0])
            else:
                # Keep the best-effort fallback, but make it visible, and use
                # float zeros so the result dtype does not depend on whether
                # any request succeeded.
                logger.warning(
                    "HuggingFaceTEI request failed with status %s; "
                    "using a zero vector as embedding",
                    response.status_code,
                )
                embeddings.append([0.0] * self.embedding_dimensions)

        return np.array(embeddings)
103
+
104
+
105
class EmbeddingGeneratorOfflineInference(EmbeddingGenerator):
    """
    Embedding generator backed by a locally loaded SentenceTransformer model.

    Args:
        model_name (str): The name of the SentenceTransformer model.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        **kwargs: Additional keyword arguments to pass to the model.

    Attributes:
        model_name (str): The name of the SentenceTransformer model.
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        model (SentenceTransformer): The loaded model instance.
    """

    def __init__(self, model_name: str, embedding_dimensions: int, **kwargs) -> None:
        """
        Set up the generator for offline inference.

        Stores the model name and embedding dimensions and loads the
        SentenceTransformer model.
        """
        self.model_name = model_name
        self.embedding_dimensions = embedding_dimensions

        # Load the model; the embedding dimensions are handed over as
        # truncate_dim, all remaining keyword arguments are passed through.
        sentence_model = SentenceTransformer(
            model_name,
            truncate_dim=embedding_dimensions,
            **kwargs,
        )
        self.model = sentence_model

        # Switch off tokenizer parallelism, because the process might
        # already be parallelized before embeddings are created.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Encode the given texts with the SentenceTransformer model.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments to pass to the
                SentenceTransformer model's ``encode`` call.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        if texts:
            # Regular case: let the model encode all texts
            return self.model.encode(texts, **kwargs)

        # No input: an empty array with the correct shape
        return np.empty((0, self.embedding_dimensions))
162
+
163
+
164
class EmbeddingGeneratorMock(EmbeddingGenerator):
    """
    A mock class for generating fake embeddings. Used for testing.

    Args:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
        **kwargs: Additional keyword arguments; accepted and ignored.

    Attributes:
        embedding_dimensions (int): The dimensionality of the generated embeddings.
    """

    def __init__(self, embedding_dimensions: int, **kwargs) -> None:
        """
        Initializes the mock EmbeddingGenerator.

        Sets the embedding dimensions.
        """
        self.embedding_dimensions = embedding_dimensions

    def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
        """
        Generates mock embeddings (all ones) for a list of input texts.

        Args:
            texts (list[str]): A list of input texts.
            **kwargs: Additional keyword arguments; accepted and ignored.

        Returns:
            np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
                containing the generated embeddings.
        """
        # Check if the input list is empty
        if not texts:
            # If empty, return an empty numpy array with the correct shape
            return np.empty((0, self.embedding_dimensions))

        # Use the configured width rather than a fixed 1024, so the mock
        # matches whatever dimensionality the rest of the pipeline expects.
        return np.ones((len(texts), self.embedding_dimensions))
@@ -1,70 +0,0 @@
1
- import os
2
-
3
- import numpy as np
4
- from sentence_transformers import SentenceTransformer
5
-
6
-
7
- class EmbeddingGenerator:
8
- """
9
- A class for generating embeddings using a given SentenceTransformer model.
10
-
11
- Args:
12
- model_name (str, SentenceTransformer): The name of the SentenceTransformer
13
- model or an SentenceTransformer model to use.
14
- embedding_dimensions (int): The dimensionality of the generated embeddings.
15
- **kwargs: Additional keyword arguments to pass to the model.
16
-
17
- Attributes:
18
- model_name (str): The name of the SentenceTransformer model.
19
- embedding_dimensions (int): The dimensionality of the generated embeddings.
20
- model (SentenceTransformer): The SentenceTransformer model instance.
21
- """
22
-
23
- def __init__(
24
- self, model_name: str | SentenceTransformer, embedding_dimensions: int, **kwargs
25
- ) -> None:
26
- """
27
- Initializes the EmbeddingGenerator.
28
-
29
- Sets the model name, embedding dimensions, and creates a
30
- SentenceTransformer model instance.
31
- """
32
- self.model_name = model_name
33
- self.embedding_dimensions = embedding_dimensions
34
-
35
- # Create a SentenceTransformer model instance with the given
36
- # model name and embedding dimensions
37
- # or set model to the given SentenceTransformer
38
- if type(model_name) is str:
39
- self.model = SentenceTransformer(
40
- model_name, truncate_dim=embedding_dimensions, **kwargs
41
- )
42
- else:
43
- self.model = model_name
44
-
45
- # Disabel parallelism for tokenizer
46
- # Needed because process might be already parallelized
47
- # before embedding creation
48
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
49
-
50
- def generate_embeddings(self, texts: list[str], **kwargs) -> np.ndarray:
51
- """
52
- Generates embeddings for a list of input texts using the
53
- SentenceTransformer model.
54
-
55
- Args:
56
- texts (list[str]): A list of input texts.
57
- **kwargs: Additional keyword arguments to pass to the
58
- SentenceTransformer model.
59
-
60
- Returns:
61
- np.ndarray: A numpy array of shape (len(texts), embedding_dimensions)
62
- containing the generated embeddings.
63
- """
64
- # Check if the input list is empty
65
- if not texts:
66
- # If empty, return an empty numpy array with the correct shape
67
- return np.empty((0, self.embedding_dimensions))
68
-
69
- # Generate embeddings using the SentenceTransformer model and return them
70
- return self.model.encode(texts, **kwargs)
File without changes
File without changes
File without changes
File without changes
File without changes