fastembed-bio 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. fastembed/__init__.py +24 -0
  2. fastembed/bio/__init__.py +3 -0
  3. fastembed/bio/protein_embedding.py +456 -0
  4. fastembed/common/__init__.py +3 -0
  5. fastembed/common/model_description.py +52 -0
  6. fastembed/common/model_management.py +471 -0
  7. fastembed/common/onnx_model.py +188 -0
  8. fastembed/common/preprocessor_utils.py +84 -0
  9. fastembed/common/types.py +27 -0
  10. fastembed/common/utils.py +69 -0
  11. fastembed/embedding.py +24 -0
  12. fastembed/image/__init__.py +3 -0
  13. fastembed/image/image_embedding.py +135 -0
  14. fastembed/image/image_embedding_base.py +55 -0
  15. fastembed/image/onnx_embedding.py +217 -0
  16. fastembed/image/onnx_image_model.py +156 -0
  17. fastembed/image/transform/functional.py +221 -0
  18. fastembed/image/transform/operators.py +499 -0
  19. fastembed/late_interaction/__init__.py +5 -0
  20. fastembed/late_interaction/colbert.py +301 -0
  21. fastembed/late_interaction/jina_colbert.py +58 -0
  22. fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
  23. fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
  24. fastembed/late_interaction/token_embeddings.py +83 -0
  25. fastembed/late_interaction_multimodal/__init__.py +5 -0
  26. fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
  27. fastembed/late_interaction_multimodal/colpali.py +327 -0
  28. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
  29. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
  30. fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
  31. fastembed/parallel_processor.py +253 -0
  32. fastembed/postprocess/__init__.py +3 -0
  33. fastembed/postprocess/muvera.py +362 -0
  34. fastembed/py.typed +1 -0
  35. fastembed/rerank/cross_encoder/__init__.py +3 -0
  36. fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
  37. fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
  38. fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
  39. fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
  40. fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
  41. fastembed/sparse/__init__.py +4 -0
  42. fastembed/sparse/bm25.py +359 -0
  43. fastembed/sparse/bm42.py +369 -0
  44. fastembed/sparse/minicoil.py +372 -0
  45. fastembed/sparse/sparse_embedding_base.py +90 -0
  46. fastembed/sparse/sparse_text_embedding.py +143 -0
  47. fastembed/sparse/splade_pp.py +196 -0
  48. fastembed/sparse/utils/minicoil_encoder.py +146 -0
  49. fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
  50. fastembed/sparse/utils/tokenizer.py +120 -0
  51. fastembed/sparse/utils/vocab_resolver.py +202 -0
  52. fastembed/text/__init__.py +3 -0
  53. fastembed/text/clip_embedding.py +56 -0
  54. fastembed/text/custom_text_embedding.py +97 -0
  55. fastembed/text/multitask_embedding.py +109 -0
  56. fastembed/text/onnx_embedding.py +353 -0
  57. fastembed/text/onnx_text_model.py +180 -0
  58. fastembed/text/pooled_embedding.py +136 -0
  59. fastembed/text/pooled_normalized_embedding.py +164 -0
  60. fastembed/text/text_embedding.py +228 -0
  61. fastembed/text/text_embedding_base.py +75 -0
  62. fastembed_bio-0.1.0.dist-info/METADATA +339 -0
  63. fastembed_bio-0.1.0.dist-info/RECORD +66 -0
  64. fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
  65. fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
  66. fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
@@ -0,0 +1,196 @@
+ from typing import Any, Iterable, Sequence, Type
+
+ import numpy as np
+ from fastembed.common import OnnxProvider
+ from fastembed.common.onnx_model import OnnxOutputContext
+ from fastembed.common.types import Device
+ from fastembed.common.utils import define_cache_dir
+ from fastembed.sparse.sparse_embedding_base import (
+     SparseEmbedding,
+     SparseTextEmbeddingBase,
+ )
+ from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
+ from fastembed.common.model_description import SparseModelDescription, ModelSource
+
+ supported_splade_models: list[SparseModelDescription] = [
+     SparseModelDescription(
+         model="prithivida/Splade_PP_en_v1",
+         vocab_size=30522,
+         description="Independent Implementation of SPLADE++ Model for English.",
+         license="apache-2.0",
+         size_in_GB=0.532,
+         sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"),
+         model_file="model.onnx",
+     ),
+     SparseModelDescription(
+         model="prithvida/Splade_PP_en_v1",
+         vocab_size=30522,
+         description="Independent Implementation of SPLADE++ Model for English.",
+         license="apache-2.0",
+         size_in_GB=0.532,
+         sources=ModelSource(hf="Qdrant/Splade_PP_en_v1"),
+         model_file="model.onnx",
+     ),
+ ]
+
+
+ class SpladePP(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
+     def _post_process_onnx_output(
+         self, output: OnnxOutputContext, **kwargs: Any
+     ) -> Iterable[SparseEmbedding]:
+         if output.attention_mask is None:
+             raise ValueError("attention_mask must be provided for document post-processing")
+
+         relu_log = np.log(1 + np.maximum(output.model_output, 0))
+
+         weighted_log = relu_log * np.expand_dims(output.attention_mask, axis=-1)
+
+         scores = np.max(weighted_log, axis=1)
+
+         # Score matrix of shape (batch_size, vocab_size)
+         # Most of the values are 0, only a few are non-zero
+         for row_scores in scores:
+             indices = row_scores.nonzero()[0]
+             values = row_scores[indices]
+             yield SparseEmbedding(values=values, indices=indices)
+
+     def token_count(
+         self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any
+     ) -> int:
+         return self._token_count(texts, batch_size=batch_size, **kwargs)
+
+     @classmethod
+     def _list_supported_models(cls) -> list[SparseModelDescription]:
+         """Lists the supported models.
+
+         Returns:
+             list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information.
+         """
+         return supported_splade_models
+
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         providers: Sequence[OnnxProvider] | None = None,
+         cuda: bool | Device = Device.AUTO,
+         device_ids: list[int] | None = None,
+         lazy_load: bool = False,
+         device_id: int | None = None,
+         specific_model_path: str | None = None,
+         **kwargs: Any,
+     ):
+         """
+         Args:
+             model_name (str): The name of the model to use.
+             cache_dir (str, optional): The path to the cache directory.
+                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                 Defaults to `fastembed_cache` in the system's temp directory.
+             threads (int, optional): The number of threads a single onnxruntime session can use. Defaults to None.
+             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+             cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                 Defaults to Device.AUTO.
+             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                 workers. Should be used with `cuda` equal to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                 with `providers`. Defaults to None.
+             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                 Should be set to True when using multiple GPUs and parallel encoding. Defaults to False.
+             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+             specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else.
+
+         Raises:
+             ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+         """
+         super().__init__(model_name, cache_dir, threads, **kwargs)
+         self.providers = providers
+         self.lazy_load = lazy_load
+         self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+         # List of device ids that can be used for data parallel processing in workers
+         self.device_ids = device_ids
+         self.cuda = cuda
+
+         # This device_id will be used if we need to load the model in the current process
+         self.device_id: int | None = None
+         if device_id is not None:
+             self.device_id = device_id
+         elif self.device_ids is not None:
+             self.device_id = self.device_ids[0]
+
+         self.model_description = self._get_model_description(model_name)
+         self.cache_dir = str(define_cache_dir(cache_dir))
+
+         self._specific_model_path = specific_model_path
+         self._model_dir = self.download_model(
+             self.model_description,
+             self.cache_dir,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+         )
+
+         if not self.lazy_load:
+             self.load_onnx_model()
+
+     def load_onnx_model(self) -> None:
+         self._load_onnx_model(
+             model_dir=self._model_dir,
+             model_file=self.model_description.model_file,
+             threads=self.threads,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_id=self.device_id,
+             extra_session_options=self._extra_session_options,
+         )
+
+     def embed(
+         self,
+         documents: str | Iterable[str],
+         batch_size: int = 256,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[SparseEmbedding]:
+         """
+         Encode a list of documents into a list of sparse embeddings.
+         Token weights are log(1 + ReLU(logits)), attention-masked and max-pooled, so variable-length inputs are handled.
+
+         Args:
+             documents: Iterator of documents or single document to embed
+             batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+             parallel:
+                 If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                 If 0, use all available cores.
+                 If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+         Returns:
+             List of embeddings, one per document
+         """
+         yield from self._embed_documents(
+             model_name=self.model_name,
+             cache_dir=str(self.cache_dir),
+             documents=documents,
+             batch_size=batch_size,
+             parallel=parallel,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_ids=self.device_ids,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+             extra_session_options=self._extra_session_options,
+             **kwargs,
+         )
+
+     @classmethod
+     def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
+         return SpladePPEmbeddingWorker
+
+
+ class SpladePPEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
+     def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> SpladePP:
+         return SpladePP(
+             model_name=model_name,
+             cache_dir=cache_dir,
+             threads=1,
+             **kwargs,
+         )
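For reference, the post-processing implemented in `_post_process_onnx_output` above can be reproduced standalone: token weights are `log(1 + ReLU(logits))`, zeroed at padding positions via the attention mask, max-pooled over the sequence, and only the non-zero vocabulary positions are kept. A minimal numpy sketch with made-up logits and shapes (only `numpy` is assumed):

```python
import numpy as np

# Hypothetical model output: (batch_size=2, seq_len=3, vocab_size=5) logits
logits = np.array(
    [[[0.2, -1.0, 0.0, 3.0, -0.5],
      [1.5, 0.0, -2.0, 0.1, 0.0],
      [0.0, 0.0, 0.0, 0.0, 0.0]],   # padding position
     [[0.0, 2.0, 0.0, 0.0, 0.0],
      [0.0, 0.0, 0.0, 0.0, 4.0],
      [0.0, 0.0, 0.0, 0.0, 0.0]]]
)
attention_mask = np.array([[1, 1, 0], [1, 1, 0]])

# Same steps as SpladePP._post_process_onnx_output
relu_log = np.log(1 + np.maximum(logits, 0))                   # (2, 3, 5)
weighted = relu_log * np.expand_dims(attention_mask, axis=-1)  # zero out padding
scores = np.max(weighted, axis=1)                              # (2, 5): max-pool over tokens

for row in scores:
    indices = row.nonzero()[0]  # active vocabulary ids
    values = row[indices]       # their weights
    print(indices, values)      # one sparse (indices, values) pair per document
```

`SpladePP.embed()` yields `SparseEmbedding` objects built from the ONNX logits in exactly this way.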
@@ -0,0 +1,146 @@
+ """
+ Pure numpy implementation of the encoder model for a single word.
+
+ This model is not trainable and should only be used for inference.
+ """
+
+ import numpy as np
+ from fastembed.common.types import NumpyArray
+
+
+ class Encoder:
+     """
+     Encoder(768, 4, 10000)
+
+     Will look like this:
+
+
+      Per-word
+      Encoder Matrix
+     ┌─────────────────────┐
+     │ Token Embedding(768)├──────┐         (10k, 768, 4)
+     └─────────────────────┘      │         ┌─────────┐
+                                  │         │         │
+     ┌─────────────────────┐      │       ┌─┴───────┐ │
+     │                     │      │       │         │ │
+     └─────────────────────┘      │     ┌─┴───────┐ │ │      ┌─────────┐
+                                  └────►│         │ │ ├─────►│Tanh     │
+     ┌─────────────────────┐            │         │ │ │      └─────────┘
+     │                     │            │         │ ├─┘
+     └─────────────────────┘            │         ├─┘
+                                        │         │
+     ┌─────────────────────┐            └─────────┘
+     │                     │
+     └─────────────────────┘
+
+     The final linear transformation is followed by a non-linear activation function: Tanh.
+
+     Tanh is used to ensure that the output is in the range [-1, 1].
+     This makes the output easier to interpret visually, assuming that each dimension
+     encodes a type of semantic cluster.
+     """
+
+     def __init__(
+         self,
+         weights: NumpyArray,
+     ):
+         self.weights = weights
+         self.vocab_size, self.input_dim, self.output_dim = weights.shape
+
+         self.encoder_weights: NumpyArray = weights
+
+         # Activation function
+         self.activation = np.tanh
+
+     @staticmethod
+     def convert_vocab_ids(vocab_ids: NumpyArray) -> NumpyArray:
+         """
+         Convert vocab_ids of shape (batch_size, seq_len) into (batch_size, seq_len, 2)
+         by appending batch_id alongside each vocab_id.
+         """
+         batch_size, seq_len = vocab_ids.shape
+         batch_ids = np.arange(batch_size, dtype=vocab_ids.dtype).reshape(batch_size, 1)
+         batch_ids = np.repeat(batch_ids, seq_len, axis=1)
+         # Stack vocab_ids and batch_ids along the last dimension
+         combined: NumpyArray = np.stack((vocab_ids, batch_ids), axis=2).astype(np.int32)
+         return combined
+
+     @classmethod
+     def avg_by_vocab_ids(
+         cls, vocab_ids: NumpyArray, embeddings: NumpyArray
+     ) -> tuple[NumpyArray, NumpyArray]:
+         """
+         Takes:
+             vocab_ids: (batch_size, seq_len) int array
+             embeddings: (batch_size, seq_len, input_dim) float array
+
+         Returns:
+             unique_flattened_vocab_ids: (total_unique, 2) array of [vocab_id, batch_id]
+             unique_flattened_embeddings: (total_unique, input_dim) averaged embeddings
+         """
+         input_dim = embeddings.shape[2]
+
+         # Flatten vocab_ids and embeddings
+         # flattened_vocab_ids: (batch_size*seq_len, 2)
+         flattened_vocab_ids = cls.convert_vocab_ids(vocab_ids).reshape(-1, 2)
+
+         # flattened_embeddings: (batch_size*seq_len, input_dim)
+         flattened_embeddings = embeddings.reshape(-1, input_dim)
+
+         # Find unique (vocab_id, batch_id) pairs
+         unique_flattened_vocab_ids, inverse_indices = np.unique(
+             flattened_vocab_ids, axis=0, return_inverse=True
+         )
+
+         # Prepare arrays to accumulate sums
+         unique_count = unique_flattened_vocab_ids.shape[0]
+         unique_flattened_embeddings = np.zeros((unique_count, input_dim), dtype=np.float32)
+         unique_flattened_count = np.zeros(unique_count, dtype=np.int32)
+
+         # Use np.add.at to accumulate sums based on inverse indices
+         np.add.at(unique_flattened_embeddings, inverse_indices, flattened_embeddings)
+         np.add.at(unique_flattened_count, inverse_indices, 1)
+
+         # Compute averages
+         unique_flattened_embeddings /= unique_flattened_count[:, None]
+
+         return unique_flattened_vocab_ids.astype(np.int32), unique_flattened_embeddings.astype(
+             np.float32
+         )
+
+     def forward(
+         self, vocab_ids: NumpyArray, embeddings: NumpyArray
+     ) -> tuple[NumpyArray, NumpyArray]:
+         """
+         Args:
+             vocab_ids: (batch_size, seq_len) int array
+             embeddings: (batch_size, seq_len, input_dim) float array
+
+         Returns:
+             unique_flattened_vocab_ids_and_batch_ids: (total_unique, 2)
+             unique_flattened_encoded: (total_unique, output_dim)
+         """
+         # Average embeddings for duplicate vocab_ids
+         unique_flattened_vocab_ids_and_batch_ids, unique_flattened_embeddings = (
+             self.avg_by_vocab_ids(vocab_ids, embeddings)
+         )
+
+         # Select the encoder weights for each unique vocab_id
+         unique_flattened_vocab_ids = unique_flattened_vocab_ids_and_batch_ids[:, 0].astype(
+             np.int32
+         )
+
+         # unique_encoder_weights: (total_unique, input_dim, output_dim)
+         unique_encoder_weights = self.encoder_weights[unique_flattened_vocab_ids]
+
+         # Compute linear transform: (total_unique, output_dim)
+         # Using Einstein summation for matrix multiplication:
+         # 'bi,bio->bo' means: for each "b" (batch element), multiply embeddings (b,i) by weights (b,i,o) -> (b,o)
+         unique_flattened_encoded = np.einsum(
+             "bi,bio->bo", unique_flattened_embeddings, unique_encoder_weights
+         )
+
+         # Apply Tanh activation and ensure float32 type
+         unique_flattened_encoded = self.activation(unique_flattened_encoded).astype(np.float32)
+
+         return unique_flattened_vocab_ids_and_batch_ids.astype(np.int32), unique_flattened_encoded
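To make the forward pass above concrete, the sketch below drives `Encoder` end to end with tiny, invented shapes (a vocabulary of 10 words, 8-dim inputs, 4-dim outputs instead of 10k x 768 x 4): duplicate `(vocab_id, batch_id)` pairs are averaged first, then each unique token is multiplied by its own `(input_dim, output_dim)` weight slice via the `'bi,bio->bo'` einsum and squashed with Tanh.

```python
import numpy as np

from fastembed.sparse.utils.minicoil_encoder import Encoder

rng = np.random.default_rng(0)

# Tiny stand-in for the real (10k, 768, 4) per-word encoder matrix
weights = rng.normal(size=(10, 8, 4)).astype(np.float32)
encoder = Encoder(weights)

# Batch of 2 sentences, 3 tokens each; token id 7 repeats inside sentence 0
vocab_ids = np.array([[7, 2, 7], [1, 1, 5]], dtype=np.int32)
embeddings = rng.normal(size=(2, 3, 8)).astype(np.float32)

ids_and_batch, encoded = encoder.forward(vocab_ids, embeddings)
print(ids_and_batch.shape)           # (4, 2): unique (vocab_id, batch_id) pairs
print(encoded.shape)                 # (4, 4): one output vector per unique pair
print(np.abs(encoded).max() <= 1.0)  # True, thanks to the Tanh activation
```

Repeated tokens (id 7 in sentence 0, id 1 in sentence 1) collapse to a single averaged row each, which is why only 4 rows come back for 6 input tokens.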
@@ -0,0 +1,244 @@
+ import copy
+ from dataclasses import dataclass
+
+ import mmh3
+ import numpy as np
+ from py_rust_stemmers import SnowballStemmer
+
+ from fastembed.common.utils import get_all_punctuation, remove_non_alphanumeric
+ from fastembed.sparse.sparse_embedding_base import SparseEmbedding
+
+ GAP = 32000
+ INT32_MAX = 2**31 - 1
+
+
+ @dataclass
+ class WordEmbedding:
+     word: str
+     forms: list[str]
+     count: int
+     word_id: int
+     embedding: list[float]
+
+
+ class SparseVectorConverter:
+     def __init__(
+         self,
+         stopwords: set[str],
+         stemmer: SnowballStemmer,
+         k: float = 1.2,
+         b: float = 0.75,
+         avg_len: float = 150.0,
+     ):
+         punctuation = set(get_all_punctuation())
+         special_tokens = {"[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"}
+
+         self.stemmer = stemmer
+         self.unwanted_tokens = punctuation | special_tokens | stopwords
+
+         self.k = k
+         self.b = b
+         self.avg_len = avg_len
+
+     @classmethod
+     def unkn_word_token_id(
+         cls, word: str, shift: int
+     ) -> int:  # 2-3 words can collide in 1 index with this mapping, not considering mmh3 collisions
+         token_hash = abs(mmh3.hash(word))
+
+         range_size = INT32_MAX - shift
+         remapped_hash = shift + (token_hash % range_size)
+
+         return remapped_hash
+
+     def bm25_tf(self, num_occurrences: int, sentence_len: int) -> float:
+         res = num_occurrences * (self.k + 1)
+         res /= num_occurrences + self.k * (1 - self.b + self.b * sentence_len / self.avg_len)
+         return res
+
+     @classmethod
+     def normalize_vector(cls, vector: list[float]) -> list[float]:
+         norm = sum([x**2 for x in vector]) ** 0.5
+         if norm < 1e-8:
+             return vector
+         return [x / norm for x in vector]
+
+     def clean_words(
+         self, sentence_embedding: dict[str, WordEmbedding], token_max_length: int = 40
+     ) -> dict[str, WordEmbedding]:
+         """
+         Clean the miniCOIL-produced sentence_embedding: tokens unknown to miniCOIL's stemmer should fully resemble
+         our BM25 token representation.
+
+         sentence_embedding = {"9°": {"word": "9°", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9°"]},
+             "9": {"word": "9", "word_id": -1, "count": 2, "embedding": [1], "forms": ["9"]},
+             "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
+             "9°9": {"word": "9°9", "word_id": -1, "count": 1, "embedding": [1], "forms": ["9°9"]},
+             "screech": {"word": "screech", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screech"]},
+             "screeched": {"word": "screeched", "word_id": -1, "count": 1, "embedding": [1], "forms": ["screeched"]}
+         }
+         cleaned_embedding_ground_truth = {
+             "9": {"word": "9", "word_id": -1, "count": 6, "embedding": [1], "forms": ["9°", "9", "9°9", "9°9"]},
+             "bat": {"word": "bat", "word_id": 2, "count": 3, "embedding": [0.2, 0.1, -0.2, -0.2], "forms": ["bats", "bat"]},
+             "screech": {"word": "screech", "word_id": -1, "count": 2, "embedding": [1], "forms": ["screech", "screeched"]}
+         }
+         """
+
+         new_sentence_embedding: dict[str, WordEmbedding] = {}
+
+         for word, embedding in sentence_embedding.items():
+             # embedding = {
+             #     "word": "vector",
+             #     "forms": ["vector", "vectors"],
+             #     "count": 2,
+             #     "word_id": 1231,
+             #     "embedding": [0.1, 0.2, 0.3, 0.4]
+             # }
+             if embedding.word_id > 0:
+                 # Known word, no need to clean
+                 new_sentence_embedding[word] = embedding
+             else:
+                 # Unknown word
+                 if word in self.unwanted_tokens:
+                     continue
+
+                 # Example complex word split:
+                 # word = `word^vec`
+                 word_cleaned = remove_non_alphanumeric(word).strip()
+                 # word_cleaned = `word vec`
+
+                 if len(word_cleaned) > 0:
+                     # Subwords: ['word', 'vec']
+                     for subword in word_cleaned.split():
+                         stemmed_subword: str = self.stemmer.stem_word(subword)
+                         if (
+                             len(stemmed_subword) <= token_max_length
+                             and stemmed_subword not in self.unwanted_tokens
+                         ):
+                             if stemmed_subword not in new_sentence_embedding:
+                                 new_sentence_embedding[stemmed_subword] = copy.deepcopy(embedding)
+                                 new_sentence_embedding[stemmed_subword].word = stemmed_subword
+                             else:
+                                 new_sentence_embedding[stemmed_subword].count += embedding.count
+                                 new_sentence_embedding[stemmed_subword].forms += embedding.forms
+
+         return new_sentence_embedding
+
+     def embedding_to_vector(
+         self,
+         sentence_embedding: dict[str, WordEmbedding],
+         embedding_size: int,
+         vocab_size: int,
+     ) -> SparseEmbedding:
+         """
+         Convert a miniCOIL sentence embedding to a Qdrant sparse vector
+
+         Example input:
+
+         ```
+         {
+             "vector": WordEmbedding({ // Vocabulary word, encoded with miniCOIL normally
+                 "word": "vector",
+                 "forms": ["vector", "vectors"],
+                 "count": 2,
+                 "word_id": 1231,
+                 "embedding": [0.1, 0.2, 0.3, 0.4]
+             }),
+             "axiotic": WordEmbedding({ // Out-of-vocabulary word, fallback to BM25
+                 "word": "axiotic",
+                 "forms": ["axiotics"],
+                 "count": 1,
+                 "word_id": -1,
+             })
+         }
+         ```

+         """
+
+         indices: list[int] = []
+         values: list[float] = []
+
+         # Example:
+         # vocab_size = 10000
+         # embedding_size = 4
+         # GAP = 32000
+         #
+         # We want to start the random-words section from a bucket that is guaranteed not to
+         # include any vocab words.
+         # We need (vocab_size * embedding_size) slots for vocab words.
+         # Therefore we need (vocab_size * embedding_size) // GAP + 1 buckets for vocab words.
+         # Therefore, we can start random words from bucket (vocab_size * embedding_size) // GAP + 1 + 1
+
+         # ID at which the scope of OOV words starts
+         unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP
+         sentence_embedding_cleaned = self.clean_words(sentence_embedding)
+
+         # Calculate sentence length after cleaning
+         sentence_len = 0
+         for embedding in sentence_embedding_cleaned.values():
+             sentence_len += embedding.count
+
+         for embedding in sentence_embedding_cleaned.values():
+             word_id = embedding.word_id
+             num_occurrences = embedding.count
+             tf = self.bm25_tf(num_occurrences, sentence_len)
+             if (
+                 word_id > 0
+             ):  # miniCOIL starts with ID 1, we generally won't have word_id == 0 (UNK), as we don't add
+                 # these words to sentence_embedding
+                 embedding_values = embedding.embedding
+                 normalized_embedding = self.normalize_vector(embedding_values)
+
+                 for val_id, value in enumerate(normalized_embedding):
+                     indices.append(
+                         word_id * embedding_size + val_id
+                     )  # since miniCOIL IDs start with 1
+                     values.append(value * tf)
+             else:
+                 indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift))
+                 values.append(tf)
+
+         return SparseEmbedding(
+             indices=np.array(indices, dtype=np.int32),
+             values=np.array(values, dtype=np.float32),
+         )
+
+     def embedding_to_vector_query(
+         self,
+         sentence_embedding: dict[str, WordEmbedding],
+         embedding_size: int,
+         vocab_size: int,
+     ) -> SparseEmbedding:
+         """
+         Same as `embedding_to_vector`, but without TF weighting
+         """
+
+         indices: list[int] = []
+         values: list[float] = []
+
+         # ID at which the scope of OOV words starts
+         unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP
+
+         sentence_embedding_cleaned = self.clean_words(sentence_embedding)
+
+         for embedding in sentence_embedding_cleaned.values():
+             word_id = embedding.word_id
+             tf = 1.0
+
+             if word_id >= 0:  # miniCOIL starts with ID 1
+                 embedding_values = embedding.embedding
+                 normalized_embedding = self.normalize_vector(embedding_values)
+
+                 for val_id, value in enumerate(normalized_embedding):
+                     indices.append(
+                         word_id * embedding_size + val_id
+                     )  # since miniCOIL IDs start with 1
+                     values.append(value * tf)
+             else:
+                 indices.append(self.unkn_word_token_id(embedding.word, unknown_words_shift))
+                 values.append(tf)
+
+         return SparseEmbedding(
+             indices=np.array(indices, dtype=np.int32),
+             values=np.array(values, dtype=np.float32),
+         )
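The index layout used by both methods above can be summarised as follows: a vocabulary word with miniCOIL id `word_id` occupies the `embedding_size` consecutive indices `word_id * embedding_size + val_id`, while out-of-vocabulary words are hashed with mmh3 into the range starting at `((vocab_size * embedding_size) // GAP + 2) * GAP`, above the whole vocabulary range; document-side values are additionally scaled by the BM25 term frequency. A short sketch that reproduces both calculations with made-up numbers (k=1.2, b=0.75 and avg_len=150 match the constructor defaults above):

```python
import mmh3

GAP = 32000
INT32_MAX = 2**31 - 1

k, b, avg_len = 1.2, 0.75, 150.0
vocab_size, embedding_size = 10_000, 4


def bm25_tf(num_occurrences: int, sentence_len: int) -> float:
    # Same formula as SparseVectorConverter.bm25_tf
    return num_occurrences * (k + 1) / (
        num_occurrences + k * (1 - b + b * sentence_len / avg_len)
    )


# A vocabulary word with miniCOIL id 1231 and a 4-dim embedding occupies indices 4924..4927
word_id = 1231
print([word_id * embedding_size + val_id for val_id in range(embedding_size)])

# OOV indices start at 96000, well above the vocabulary slots (which end near 10_000 * 4)
unknown_words_shift = ((vocab_size * embedding_size) // GAP + 2) * GAP
print(unknown_words_shift)  # 96000

# and each OOV word is hashed into [unknown_words_shift, INT32_MAX)
token_hash = abs(mmh3.hash("axiotic"))
print(unknown_words_shift + token_hash % (INT32_MAX - unknown_words_shift))

# BM25 term frequency for a word occurring twice in a 20-token sentence
print(round(bm25_tf(2, 20), 4))  # ~ 1.8182
```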