fastembed_bio-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastembed/__init__.py +24 -0
- fastembed/bio/__init__.py +3 -0
- fastembed/bio/protein_embedding.py +456 -0
- fastembed/common/__init__.py +3 -0
- fastembed/common/model_description.py +52 -0
- fastembed/common/model_management.py +471 -0
- fastembed/common/onnx_model.py +188 -0
- fastembed/common/preprocessor_utils.py +84 -0
- fastembed/common/types.py +27 -0
- fastembed/common/utils.py +69 -0
- fastembed/embedding.py +24 -0
- fastembed/image/__init__.py +3 -0
- fastembed/image/image_embedding.py +135 -0
- fastembed/image/image_embedding_base.py +55 -0
- fastembed/image/onnx_embedding.py +217 -0
- fastembed/image/onnx_image_model.py +156 -0
- fastembed/image/transform/functional.py +221 -0
- fastembed/image/transform/operators.py +499 -0
- fastembed/late_interaction/__init__.py +5 -0
- fastembed/late_interaction/colbert.py +301 -0
- fastembed/late_interaction/jina_colbert.py +58 -0
- fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
- fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
- fastembed/late_interaction/token_embeddings.py +83 -0
- fastembed/late_interaction_multimodal/__init__.py +5 -0
- fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
- fastembed/late_interaction_multimodal/colpali.py +327 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
- fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
- fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
- fastembed/parallel_processor.py +253 -0
- fastembed/postprocess/__init__.py +3 -0
- fastembed/postprocess/muvera.py +362 -0
- fastembed/py.typed +1 -0
- fastembed/rerank/cross_encoder/__init__.py +3 -0
- fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
- fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
- fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
- fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
- fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
- fastembed/sparse/__init__.py +4 -0
- fastembed/sparse/bm25.py +359 -0
- fastembed/sparse/bm42.py +369 -0
- fastembed/sparse/minicoil.py +372 -0
- fastembed/sparse/sparse_embedding_base.py +90 -0
- fastembed/sparse/sparse_text_embedding.py +143 -0
- fastembed/sparse/splade_pp.py +196 -0
- fastembed/sparse/utils/minicoil_encoder.py +146 -0
- fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
- fastembed/sparse/utils/tokenizer.py +120 -0
- fastembed/sparse/utils/vocab_resolver.py +202 -0
- fastembed/text/__init__.py +3 -0
- fastembed/text/clip_embedding.py +56 -0
- fastembed/text/custom_text_embedding.py +97 -0
- fastembed/text/multitask_embedding.py +109 -0
- fastembed/text/onnx_embedding.py +353 -0
- fastembed/text/onnx_text_model.py +180 -0
- fastembed/text/pooled_embedding.py +136 -0
- fastembed/text/pooled_normalized_embedding.py +164 -0
- fastembed/text/text_embedding.py +228 -0
- fastembed/text/text_embedding_base.py +75 -0
- fastembed_bio-0.1.0.dist-info/METADATA +339 -0
- fastembed_bio-0.1.0.dist-info/RECORD +66 -0
- fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
- fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
- fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0

fastembed/late_interaction/colbert.py

@@ -0,0 +1,301 @@
+import string
+from typing import Any, Iterable, Sequence, Type
+
+import numpy as np
+from tokenizers import Encoding, Tokenizer
+
+from fastembed.common.preprocessor_utils import load_tokenizer
+from fastembed.common.types import NumpyArray, Device
+from fastembed.common import OnnxProvider
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.utils import define_cache_dir, iter_batch
+from fastembed.late_interaction.late_interaction_embedding_base import (
+    LateInteractionTextEmbeddingBase,
+)
+from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
+from fastembed.common.model_description import DenseModelDescription, ModelSource
+
+supported_colbert_models: list[DenseModelDescription] = [
+    DenseModelDescription(
+        model="colbert-ir/colbertv2.0",
+        dim=128,
+        description="Text embeddings, Unimodal (text), English, 512 input tokens truncation, 2023 year",
+        license="mit",
+        size_in_GB=0.44,
+        sources=ModelSource(hf="colbert-ir/colbertv2.0"),
+        model_file="model.onnx",
+    ),
+    DenseModelDescription(
+        model="answerdotai/answerai-colbert-small-v1",
+        dim=96,
+        description="Text embeddings, Unimodal (text), English, 512 input tokens truncation, 2024 year",
+        license="apache-2.0",
+        size_in_GB=0.13,
+        sources=ModelSource(hf="answerdotai/answerai-colbert-small-v1"),
+        model_file="vespa_colbert.onnx",
+    ),
+]
+
+
+class Colbert(LateInteractionTextEmbeddingBase, OnnxTextModel[NumpyArray]):
+    QUERY_MARKER_TOKEN_ID = 1
+    DOCUMENT_MARKER_TOKEN_ID = 2
+    MIN_QUERY_LENGTH = 31  # it's 32, we add one additional special token in the beginning
+    MASK_TOKEN = "[MASK]"
+
+    def _post_process_onnx_output(
+        self, output: OnnxOutputContext, is_doc: bool = True, **kwargs: Any
+    ) -> Iterable[NumpyArray]:
+        if not is_doc:
+            for embedding in output.model_output:
+                yield embedding
+        else:
+            if output.input_ids is None or output.attention_mask is None:
+                raise ValueError(
+                    "input_ids and attention_mask must be provided for document post-processing"
+                )
+
+            for i, token_sequence in enumerate(output.input_ids):
+                for j, token_id in enumerate(token_sequence):  # type: ignore
+                    if token_id in self.skip_list or token_id == self.pad_token_id:
+                        output.attention_mask[i, j] = 0
+
+            output.model_output *= np.expand_dims(output.attention_mask, 2)
+            norm = np.linalg.norm(output.model_output, ord=2, axis=2, keepdims=True)
+            norm_clamped = np.maximum(norm, 1e-12)
+            output.model_output /= norm_clamped
+
+            for embedding, attention_mask in zip(output.model_output, output.attention_mask):
+                yield embedding[attention_mask == 1]
+
+    def _preprocess_onnx_input(
+        self, onnx_input: dict[str, NumpyArray], is_doc: bool = True, **kwargs: Any
+    ) -> dict[str, NumpyArray]:
+        marker_token = self.DOCUMENT_MARKER_TOKEN_ID if is_doc else self.QUERY_MARKER_TOKEN_ID
+        onnx_input["input_ids"] = np.insert(
+            onnx_input["input_ids"].astype(np.int64), 1, marker_token, axis=1
+        )
+        onnx_input["attention_mask"] = np.insert(
+            onnx_input["attention_mask"].astype(np.int64), 1, 1, axis=1
+        )
+        return onnx_input
+
+    def tokenize(self, documents: list[str], is_doc: bool = True, **kwargs: Any) -> list[Encoding]:
+        return (
+            self._tokenize_documents(documents=documents)
+            if is_doc
+            else self._tokenize_query(query=next(iter(documents)))
+        )
+
+    def _tokenize_query(self, query: str) -> list[Encoding]:
+        assert self.query_tokenizer is not None
+        encoded = self.query_tokenizer.encode_batch([query])
+        return encoded
+
+    def _tokenize_documents(self, documents: list[str]) -> list[Encoding]:
+        encoded = self.tokenizer.encode_batch(documents)  # type: ignore[union-attr]
+        return encoded
+
+    def token_count(
+        self,
+        texts: str | Iterable[str],
+        batch_size: int = 1024,
+        is_doc: bool = True,
+        include_extension: bool = False,
+        **kwargs: Any,
+    ) -> int:
+        if not hasattr(self, "model") or self.model is None:
+            self.load_onnx_model()  # loads the tokenizer as well
+        token_num = 0
+        texts = [texts] if isinstance(texts, str) else texts
+        tokenizer = self.tokenizer if is_doc else self.query_tokenizer
+        assert tokenizer is not None
+        for batch in iter_batch(texts, batch_size):
+            for tokens in tokenizer.encode_batch(batch):
+                if is_doc:
+                    token_num += sum(tokens.attention_mask)
+                else:
+                    attend_count = sum(tokens.attention_mask)
+                    if include_extension:
+                        token_num += max(attend_count, self.MIN_QUERY_LENGTH)
+
+                    else:
+                        token_num += attend_count
+            if include_extension:
+                token_num += len(
+                    batch
+                )  # add 1 for each cls.DOC_MARKER_TOKEN_ID or cls.QUERY_MARKER_TOKEN_ID
+
+        return token_num
+
+    @classmethod
+    def _list_supported_models(cls) -> list[DenseModelDescription]:
+        """Lists the supported models.
+
+        Returns:
+            list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information.
+        """
+        return supported_colbert_models
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        threads: int | None = None,
+        providers: Sequence[OnnxProvider] | None = None,
+        cuda: bool | Device = Device.AUTO,
+        device_ids: list[int] | None = None,
+        lazy_load: bool = False,
+        device_id: int | None = None,
+        specific_model_path: str | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`
+                Defaults to Device.AUTO.
+            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                workers. Should be used with `cuda` equals to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
+            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
+            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        self.providers = providers
+        self.lazy_load = lazy_load
+        self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+        # List of device ids, that can be used for data parallel processing in workers
+        self.device_ids = device_ids
+        self.cuda = cuda
+
+        # This device_id will be used if we need to load model in current process
+        self.device_id: int | None = None
+        if device_id is not None:
+            self.device_id = device_id
+        elif self.device_ids is not None:
+            self.device_id = self.device_ids[0]
+
+        self.model_description = self._get_model_description(model_name)
+        self.cache_dir = str(define_cache_dir(cache_dir))
+
+        self._specific_model_path = specific_model_path
+        self._model_dir = self.download_model(
+            self.model_description,
+            self.cache_dir,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+        )
+        self.mask_token_id: int | None = None
+        self.pad_token_id: int | None = None
+        self.skip_list: set[int] = set()
+
+        self.query_tokenizer: Tokenizer | None = None
+
+        if not self.lazy_load:
+            self.load_onnx_model()
+
+    def load_onnx_model(self) -> None:
+        self._load_onnx_model(
+            model_dir=self._model_dir,
+            model_file=self.model_description.model_file,
+            threads=self.threads,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_id=self.device_id,
+            extra_session_options=self._extra_session_options,
+        )
+        self.query_tokenizer, _ = load_tokenizer(model_dir=self._model_dir)
+
+        assert self.tokenizer is not None
+        self.mask_token_id = self.special_token_to_id[self.MASK_TOKEN]
+        self.pad_token_id = self.tokenizer.padding["pad_id"]
+        self.skip_list = {
+            self.tokenizer.encode(symbol, add_special_tokens=False).ids[0]
+            for symbol in string.punctuation
+        }
+        current_max_length = self.tokenizer.truncation["max_length"]
+        # ensure not to overflow after adding document-marker
+        self.tokenizer.enable_truncation(max_length=current_max_length - 1)
+        self.query_tokenizer.enable_truncation(max_length=current_max_length - 1)
+        self.query_tokenizer.enable_padding(
+            pad_token=self.MASK_TOKEN,
+            pad_id=self.mask_token_id,
+            length=self.MIN_QUERY_LENGTH,
+        )
+
+    def embed(
+        self,
+        documents: str | Iterable[str],
+        batch_size: int = 256,
+        parallel: int | None = None,
+        **kwargs: Any,
+    ) -> Iterable[NumpyArray]:
+        """
+        Encode a list of documents into list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self._embed_documents(
+            model_name=self.model_name,
+            cache_dir=str(self.cache_dir),
+            documents=documents,
+            batch_size=batch_size,
+            parallel=parallel,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_ids=self.device_ids,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+            extra_session_options=self._extra_session_options,
+            **kwargs,
+        )
+
+    def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]:
+        if isinstance(query, str):
+            query = [query]
+
+        if not hasattr(self, "model") or self.model is None:
+            self.load_onnx_model()
+
+        for text in query:
+            yield from self._post_process_onnx_output(
+                self.onnx_embed([text], is_doc=False), is_doc=False
+            )
+
+    @classmethod
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[NumpyArray]]:
+        return ColbertEmbeddingWorker
+
+
+class ColbertEmbeddingWorker(TextEmbeddingWorker[NumpyArray]):
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Colbert:
+        return Colbert(
+            model_name=model_name,
+            cache_dir=cache_dir,
+            threads=1,
+            **kwargs,
+        )
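
A note on the hunk above: `Colbert._preprocess_onnx_input` splices a marker token into position 1 of every sequence (right after the leading special token) and widens the attention mask to match, which is also why `load_onnx_model` shrinks both tokenizers' truncation limit by one. A minimal numpy sketch of that splice in isolation; the token ids below are illustrative BERT-style ids, not taken from this package:

import numpy as np

# two tokenized inputs: 101/102 stand in for [CLS]/[SEP], 0 is padding
input_ids = np.array([[101, 7592, 2088, 102], [101, 7592, 102, 0]], dtype=np.int64)
attention_mask = np.array([[1, 1, 1, 1], [1, 1, 1, 0]], dtype=np.int64)

DOCUMENT_MARKER_TOKEN_ID = 2  # Colbert.DOCUMENT_MARKER_TOKEN_ID

# np.insert with axis=1 adds a whole column at index 1
input_ids = np.insert(input_ids, 1, DOCUMENT_MARKER_TOKEN_ID, axis=1)
attention_mask = np.insert(attention_mask, 1, 1, axis=1)  # the marker is always attended

print(input_ids)
# [[ 101    2 7592 2088  102]
#  [ 101    2 7592  102    0]]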

fastembed/late_interaction/jina_colbert.py

@@ -0,0 +1,58 @@
+from typing import Any, Type
+
+from fastembed.common.types import NumpyArray
+from fastembed.late_interaction.colbert import Colbert, ColbertEmbeddingWorker
+from fastembed.common.model_description import DenseModelDescription, ModelSource
+
+supported_jina_colbert_models: list[DenseModelDescription] = [
+    DenseModelDescription(
+        model="jinaai/jina-colbert-v2",
+        dim=128,
+        description="New model that expands capabilities of colbert-v1 with multilingual and context length of 8192, 2024 year",
+        license="cc-by-nc-4.0",
+        size_in_GB=2.24,
+        sources=ModelSource(hf="jinaai/jina-colbert-v2"),
+        model_file="onnx/model.onnx",
+        additional_files=["onnx/model.onnx_data"],
+    )
+]
+
+
+class JinaColbert(Colbert):
+    QUERY_MARKER_TOKEN_ID = 250002
+    DOCUMENT_MARKER_TOKEN_ID = 250003
+    MIN_QUERY_LENGTH = 31  # it's 32, we add one additional special token in the beginning
+    MASK_TOKEN = "<mask>"
+
+    @classmethod
+    def _get_worker_class(cls) -> Type[ColbertEmbeddingWorker]:
+        return JinaColbertEmbeddingWorker
+
+    @classmethod
+    def _list_supported_models(cls) -> list[DenseModelDescription]:
+        """Lists the supported models.
+
+        Returns:
+            list[DenseModelDescription]: A list of DenseModelDescription objects containing the model information.
+        """
+        return supported_jina_colbert_models
+
+    def _preprocess_onnx_input(
+        self, onnx_input: dict[str, NumpyArray], is_doc: bool = True, **kwargs: Any
+    ) -> dict[str, NumpyArray]:
+        onnx_input = super()._preprocess_onnx_input(onnx_input, is_doc)
+
+        # the attention mask for jina-colbert-v2 is always 1 in queries
+        if not is_doc:
+            onnx_input["attention_mask"][:] = 1
+        return onnx_input
+
+
+class JinaColbertEmbeddingWorker(ColbertEmbeddingWorker):
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> JinaColbert:
+        return JinaColbert(
+            model_name=model_name,
+            cache_dir=cache_dir,
+            threads=1,
+            **kwargs,
+        )
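
The hunk above is the extension pattern for ColBERT variants: a subclass overrides only the marker token ids, the mask token, and any model-specific input quirk (jina-colbert-v2 attends to its <mask> query padding, hence the all-ones attention mask), plus a paired worker class for parallel encoding. A hypothetical sketch of wiring up another variant the same way; the class names and token ids below are placeholders for illustration, not a real model:

from typing import Any, Type

from fastembed.late_interaction.colbert import Colbert, ColbertEmbeddingWorker
from fastembed.common.model_description import DenseModelDescription


class ExampleColbert(Colbert):  # hypothetical variant, for illustration only
    QUERY_MARKER_TOKEN_ID = 5  # placeholder id
    DOCUMENT_MARKER_TOKEN_ID = 6  # placeholder id
    MASK_TOKEN = "[MASK]"

    @classmethod
    def _list_supported_models(cls) -> list[DenseModelDescription]:
        return []  # would return this variant's DenseModelDescription entries

    @classmethod
    def _get_worker_class(cls) -> Type[ColbertEmbeddingWorker]:
        return ExampleColbertWorker


class ExampleColbertWorker(ColbertEmbeddingWorker):
    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> ExampleColbert:
        return ExampleColbert(model_name=model_name, cache_dir=cache_dir, threads=1, **kwargs)

Adding the subclass to `LateInteractionTextEmbedding.EMBEDDINGS_REGISTRY` (defined in the last hunk below) would make it resolvable by model name.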

fastembed/late_interaction/late_interaction_embedding_base.py

@@ -0,0 +1,80 @@
+from typing import Iterable, Any
+
+from fastembed.common.model_description import DenseModelDescription
+from fastembed.common.types import NumpyArray
+from fastembed.common.model_management import ModelManagement
+
+
+class LateInteractionTextEmbeddingBase(ModelManagement[DenseModelDescription]):
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        threads: int | None = None,
+        **kwargs: Any,
+    ):
+        self.model_name = model_name
+        self.cache_dir = cache_dir
+        self.threads = threads
+        self._local_files_only = kwargs.pop("local_files_only", False)
+        self._embedding_size: int | None = None
+
+    def embed(
+        self,
+        documents: str | Iterable[str],
+        batch_size: int = 256,
+        parallel: int | None = None,
+        **kwargs: Any,
+    ) -> Iterable[NumpyArray]:
+        raise NotImplementedError()
+
+    def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]:
+        """
+        Embeds a list of text passages into a list of embeddings.
+
+        Args:
+            texts (Iterable[str]): The list of texts to embed.
+            **kwargs: Additional keyword argument to pass to the embed method.
+
+        Yields:
+            Iterable[NdArray]: The embeddings.
+        """
+
+        # This is model-specific, so that different models can have specialized implementations
+        yield from self.embed(texts, **kwargs)
+
+    def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]:
+        """
+        Embeds queries
+
+        Args:
+            query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries.
+
+        Returns:
+            Iterable[NdArray]: The embeddings.
+        """
+
+        # This is model-specific, so that different models can have specialized implementations
+        if isinstance(query, str):
+            yield from self.embed([query], **kwargs)
+        else:
+            yield from self.embed(query, **kwargs)
+
+    @classmethod
+    def get_embedding_size(cls, model_name: str) -> int:
+        """Returns embedding size of the chosen model."""
+        raise NotImplementedError("Subclasses must implement this method")
+
+    @property
+    def embedding_size(self) -> int:
+        """Returns embedding size for the current model"""
+        raise NotImplementedError("Subclasses must implement this method")
+
+    def token_count(
+        self,
+        texts: str | Iterable[str],
+        batch_size: int = 1024,
+        **kwargs: Any,
+    ) -> int:
+        """Returns the number of tokens in the texts."""
+        raise NotImplementedError("Subclasses must implement this method")
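
The base class above fixes the contract: `embed`, `passage_embed`, and `query_embed` each yield one (num_tokens, dim) matrix per input, and scoring is left to the consumer (e.g. a vector database). For late interaction the usual scorer is MaxSim: each query token takes its best similarity over all document tokens, and the maxima are summed. A minimal numpy sketch, assuming both matrices are L2-normalized per token (document outputs are normalized in `Colbert._post_process_onnx_output`; whether raw query outputs are depends on the exported model):

import numpy as np

def maxsim_score(query_tokens: np.ndarray, doc_tokens: np.ndarray) -> float:
    # query_tokens: (num_query_tokens, dim), doc_tokens: (num_doc_tokens, dim)
    similarities = query_tokens @ doc_tokens.T  # cosine similarity for unit-norm rows
    # best document token per query token, summed over the query
    return float(similarities.max(axis=1).sum())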

fastembed/late_interaction/late_interaction_text_embedding.py

@@ -0,0 +1,180 @@
+from typing import Any, Iterable, Sequence, Type
+from dataclasses import asdict
+
+from fastembed.common.model_description import DenseModelDescription
+from fastembed.common.types import NumpyArray, Device
+from fastembed.common import OnnxProvider
+from fastembed.late_interaction.colbert import Colbert
+from fastembed.late_interaction.jina_colbert import JinaColbert
+from fastembed.late_interaction.late_interaction_embedding_base import (
+    LateInteractionTextEmbeddingBase,
+)
+
+
+class LateInteractionTextEmbedding(LateInteractionTextEmbeddingBase):
+    EMBEDDINGS_REGISTRY: list[Type[LateInteractionTextEmbeddingBase]] = [Colbert, JinaColbert]
+
+    @classmethod
+    def list_supported_models(cls) -> list[dict[str, Any]]:
+        """
+        Lists the supported models.
+
+        Returns:
+            list[dict[str, Any]]: A list of dictionaries containing the model information.
+
+        Example:
+            ```
+            [
+                {
+                    "model": "colbert-ir/colbertv2.0",
+                    "dim": 128,
+                    "description": "Late interaction model",
+                    "license": "mit",
+                    "size_in_GB": 0.44,
+                    "sources": {
+                        "hf": "colbert-ir/colbertv2.0",
+                    },
+                    "model_file": "model.onnx",
+                },
+            ]
+            ```
+        """
+        return [asdict(model) for model in cls._list_supported_models()]
+
+    @classmethod
+    def _list_supported_models(cls) -> list[DenseModelDescription]:
+        result: list[DenseModelDescription] = []
+        for embedding in cls.EMBEDDINGS_REGISTRY:
+            result.extend(embedding._list_supported_models())
+        return result
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        threads: int | None = None,
+        providers: Sequence[OnnxProvider] | None = None,
+        cuda: bool | Device = Device.AUTO,
+        device_ids: list[int] | None = None,
+        lazy_load: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        for EMBEDDING_MODEL_TYPE in self.EMBEDDINGS_REGISTRY:
+            supported_models = EMBEDDING_MODEL_TYPE._list_supported_models()
+            if any(model_name.lower() == model.model.lower() for model in supported_models):
+                self.model = EMBEDDING_MODEL_TYPE(
+                    model_name,
+                    cache_dir,
+                    threads=threads,
+                    providers=providers,
+                    cuda=cuda,
+                    device_ids=device_ids,
+                    lazy_load=lazy_load,
+                    **kwargs,
+                )
+                return
+
+        raise ValueError(
+            f"Model {model_name} is not supported in LateInteractionTextEmbedding."
+            "Please check the supported models using `LateInteractionTextEmbedding.list_supported_models()`"
+        )
+
+    @property
+    def embedding_size(self) -> int:
+        """Get the embedding size of the current model"""
+        if self._embedding_size is None:
+            self._embedding_size = self.get_embedding_size(self.model_name)
+        return self._embedding_size
+
+    @classmethod
+    def get_embedding_size(cls, model_name: str) -> int:
+        """Get the embedding size of the passed model
+
+        Args:
+            model_name (str): The name of the model to get embedding size for.
+
+        Returns:
+            int: The size of the embedding.
+
+        Raises:
+            ValueError: If the model name is not found in the supported models.
+        """
+        descriptions = cls._list_supported_models()
+        embedding_size: int | None = None
+        for description in descriptions:
+            if description.model.lower() == model_name.lower():
+                embedding_size = description.dim
+                break
+        if embedding_size is None:
+            model_names = [description.model for description in descriptions]
+            raise ValueError(
+                f"Embedding size for model {model_name} was None. "
+                f"Available model names: {model_names}"
+            )
+        return embedding_size
+
+    def embed(
+        self,
+        documents: str | Iterable[str],
+        batch_size: int = 256,
+        parallel: int | None = None,
+        **kwargs: Any,
+    ) -> Iterable[NumpyArray]:
+        """
+        Encode a list of documents into list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self.model.embed(documents, batch_size, parallel, **kwargs)
+
+    def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]:
+        """
+        Embeds queries
+
+        Args:
+            query (Union[str, Iterable[str]]): The query to embed, or an iterable e.g. list of queries.
+
+        Returns:
+            Iterable[NdArray]: The embeddings.
+        """
+
+        # This is model-specific, so that different models can have specialized implementations
+        yield from self.model.query_embed(query, **kwargs)
+
+    def token_count(
+        self,
+        texts: str | Iterable[str],
+        batch_size: int = 1024,
+        is_doc: bool = True,
+        include_extension: bool = False,
+        **kwargs: Any,
+    ) -> int:
+        """Returns the number of tokens in the texts.
+
+        Args:
+            texts (str | Iterable[str]): The list of texts to embed.
+            batch_size (int): Batch size for encoding
+            is_doc (bool): Whether the texts are documents (disable embedding a query with include_mask=True).
+            include_extension (bool): Turn on to count DOC / QUERY marker tokens, and [MASK] token in query mode.
+
+        Returns:
+            int: Sum of number of tokens in the texts.
+        """
+        return self.model.token_count(
+            texts,
+            batch_size=batch_size,
+            is_doc=is_doc,
+            include_extension=include_extension,
+            **kwargs,
+        )
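
Taken together, `LateInteractionTextEmbedding` is the facade over the registry. A minimal usage sketch, assuming the class is re-exported from the package root as in upstream fastembed (this wheel's fastembed/__init__.py is not shown in this diff):

from fastembed import LateInteractionTextEmbedding

model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")  # downloads the ONNX model on first use

docs = list(model.embed(["ColBERT encodes every token separately."]))
queries = list(model.query_embed("what does colbert encode?"))

# One matrix per input: documents keep only attended, non-punctuation tokens;
# queries are padded with [MASK] to the fixed minimum length (31 + 1 marker = 32).
print(docs[0].shape)     # (num_doc_tokens, 128)
print(queries[0].shape)  # (32, 128)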