fastembed_bio-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. fastembed/__init__.py +24 -0
  2. fastembed/bio/__init__.py +3 -0
  3. fastembed/bio/protein_embedding.py +456 -0
  4. fastembed/common/__init__.py +3 -0
  5. fastembed/common/model_description.py +52 -0
  6. fastembed/common/model_management.py +471 -0
  7. fastembed/common/onnx_model.py +188 -0
  8. fastembed/common/preprocessor_utils.py +84 -0
  9. fastembed/common/types.py +27 -0
  10. fastembed/common/utils.py +69 -0
  11. fastembed/embedding.py +24 -0
  12. fastembed/image/__init__.py +3 -0
  13. fastembed/image/image_embedding.py +135 -0
  14. fastembed/image/image_embedding_base.py +55 -0
  15. fastembed/image/onnx_embedding.py +217 -0
  16. fastembed/image/onnx_image_model.py +156 -0
  17. fastembed/image/transform/functional.py +221 -0
  18. fastembed/image/transform/operators.py +499 -0
  19. fastembed/late_interaction/__init__.py +5 -0
  20. fastembed/late_interaction/colbert.py +301 -0
  21. fastembed/late_interaction/jina_colbert.py +58 -0
  22. fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
  23. fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
  24. fastembed/late_interaction/token_embeddings.py +83 -0
  25. fastembed/late_interaction_multimodal/__init__.py +5 -0
  26. fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
  27. fastembed/late_interaction_multimodal/colpali.py +327 -0
  28. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
  29. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
  30. fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
  31. fastembed/parallel_processor.py +253 -0
  32. fastembed/postprocess/__init__.py +3 -0
  33. fastembed/postprocess/muvera.py +362 -0
  34. fastembed/py.typed +1 -0
  35. fastembed/rerank/cross_encoder/__init__.py +3 -0
  36. fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
  37. fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
  38. fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
  39. fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
  40. fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
  41. fastembed/sparse/__init__.py +4 -0
  42. fastembed/sparse/bm25.py +359 -0
  43. fastembed/sparse/bm42.py +369 -0
  44. fastembed/sparse/minicoil.py +372 -0
  45. fastembed/sparse/sparse_embedding_base.py +90 -0
  46. fastembed/sparse/sparse_text_embedding.py +143 -0
  47. fastembed/sparse/splade_pp.py +196 -0
  48. fastembed/sparse/utils/minicoil_encoder.py +146 -0
  49. fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
  50. fastembed/sparse/utils/tokenizer.py +120 -0
  51. fastembed/sparse/utils/vocab_resolver.py +202 -0
  52. fastembed/text/__init__.py +3 -0
  53. fastembed/text/clip_embedding.py +56 -0
  54. fastembed/text/custom_text_embedding.py +97 -0
  55. fastembed/text/multitask_embedding.py +109 -0
  56. fastembed/text/onnx_embedding.py +353 -0
  57. fastembed/text/onnx_text_model.py +180 -0
  58. fastembed/text/pooled_embedding.py +136 -0
  59. fastembed/text/pooled_normalized_embedding.py +164 -0
  60. fastembed/text/text_embedding.py +228 -0
  61. fastembed/text/text_embedding_base.py +75 -0
  62. fastembed_bio-0.1.0.dist-info/METADATA +339 -0
  63. fastembed_bio-0.1.0.dist-info/RECORD +66 -0
  64. fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
  65. fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
  66. fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
fastembed/postprocess/muvera.py ADDED
@@ -0,0 +1,362 @@
+ import numpy as np
+
+ from fastembed.common.types import NumpyArray
+ from fastembed.late_interaction.late_interaction_embedding_base import (
+     LateInteractionTextEmbeddingBase,
+ )
+ from fastembed.late_interaction_multimodal.late_interaction_multimodal_embedding_base import (
+     LateInteractionMultimodalEmbeddingBase,
+ )
+
+
+ MultiVectorModel = LateInteractionTextEmbeddingBase | LateInteractionMultimodalEmbeddingBase
+ MAX_HAMMING_DISTANCE = 65  # 64 bits + 1
+ POPCOUNT_LUT = np.array([bin(x).count("1") for x in range(256)], dtype=np.uint8)
+
+
+ def hamming_distance_matrix(ids: np.ndarray) -> np.ndarray:
+     """Compute the full pairwise Hamming distance matrix.
+
+     Args:
+         ids: shape (n,) - array of integer ids; distances are computed between
+             the binary representations of the id values
+
+     Returns:
+         np.ndarray (n, n) - Hamming distance matrix
+     """
+     n = len(ids)
+     xor_vals = np.bitwise_xor(ids[:, None], ids[None, :])  # (n, n), 64-bit ints
+     bytes_view = xor_vals.view(np.uint8).reshape(n, n, 8)  # (n, n, 8)
+     return POPCOUNT_LUT[bytes_view].sum(axis=2)
+
+
+ class SimHashProjection:
+     """
+     SimHash projection component for MUVERA clustering.
+
+     This class implements locality-sensitive hashing using random hyperplanes
+     to partition the vector space into 2^k_sim clusters. Each vector is assigned
+     to a cluster based on which side of the k_sim random hyperplanes it falls on.
+
+     Attributes:
+         k_sim (int): Number of SimHash functions (hyperplanes)
+         dim (int): Dimensionality of input vectors
+         simhash_vectors (np.ndarray): Random hyperplane normal vectors of shape (dim, k_sim)
+     """
+
+     def __init__(self, k_sim: int, dim: int, random_generator: np.random.Generator):
+         """
+         Initialize SimHash projection with random hyperplanes.
+
+         Args:
+             k_sim (int): Number of SimHash functions, determines 2^k_sim clusters
+             dim (int): Dimensionality of input vectors
+             random_generator (np.random.Generator): Random number generator for reproducibility
+         """
+         self.k_sim = k_sim
+         self.dim = dim
+         # Generate k_sim random hyperplanes (normal vectors) from the standard normal distribution
+         self.simhash_vectors = random_generator.normal(size=(dim, k_sim))
+
+     def get_cluster_ids(self, vectors: np.ndarray) -> np.ndarray:
+         """
+         Compute the cluster IDs for the given vectors using SimHash.
+
+         Each cluster ID is determined by computing the dot product of a vector
+         with each hyperplane normal vector, taking the sign, and interpreting
+         the resulting binary string as an integer.
+
+         Args:
+             vectors (np.ndarray): Input vectors of shape (n, dim)
+
+         Returns:
+             np.ndarray: Cluster IDs in range [0, 2^k_sim - 1]
+         """
+         dot_product = (
+             vectors @ self.simhash_vectors
+         )  # (token_num, dim) x (dim, k_sim) -> (token_num, k_sim)
+         cluster_ids = (dot_product > 0) @ (1 << np.arange(self.k_sim))
+         return cluster_ids
+
+
+ class Muvera:
+     """
+     MUVERA (Multi-Vector Retrieval Algorithm) implementation.
+
+     This class creates Fixed Dimensional Encodings (FDEs) from variable-length
+     sequences of vectors by using SimHash clustering and random projections.
+     The process involves:
+     1. Clustering vectors using multiple SimHash projections
+     2. Computing cluster centers (with different strategies for docs vs queries)
+     3. Applying random projections for dimensionality reduction
+     4. Concatenating results from all projections
+
+     Attributes:
+         k_sim (int): Number of SimHash functions per projection
+         dim (int): Input vector dimensionality
+         dim_proj (int): Output dimensionality after random projection
+         r_reps (int): Number of random projection repetitions
+         random_seed (int): Random seed for consistent random matrix generation
+         simhash_projections (list[SimHashProjection]): SimHash instances for clustering
+         dim_reduction_projections (np.ndarray): Random projection matrices of shape (r_reps, dim, dim_proj)
+     """
+
+     def __init__(
+         self,
+         dim: int,
+         k_sim: int = 5,
+         dim_proj: int = 16,
+         r_reps: int = 20,
+         random_seed: int = 42,
+     ):
+         """
+         Initialize MUVERA with the specified parameters.
+
+         Args:
+             dim (int): Dimensionality of individual input vectors
+             k_sim (int, optional): Number of SimHash functions (creates 2^k_sim clusters).
+                 Defaults to 5.
+             dim_proj (int, optional): Dimensionality after random projection (must be <= dim).
+                 Defaults to 16.
+             r_reps (int, optional): Number of random projection repetitions for robustness.
+                 Defaults to 20.
+             random_seed (int, optional): Seed for the random number generator to ensure
+                 reproducible results. Defaults to 42.
+
+         Raises:
+             ValueError: If dim_proj > dim (cannot project to a higher dimensionality)
+         """
+         if dim_proj > dim:
+             raise ValueError(
+                 f"Cannot project to a higher dimensionality (dim_proj={dim_proj} > dim={dim})"
+             )
+
+         self.k_sim = k_sim
+         self.dim = dim
+         self.dim_proj = dim_proj
+         self.r_reps = r_reps
+         # Create r_reps independent SimHash projections for robustness
+         generator = np.random.default_rng(random_seed)
+         self.simhash_projections = [
+             SimHashProjection(k_sim=self.k_sim, dim=self.dim, random_generator=generator)
+             for _ in range(r_reps)
+         ]
+         # Random projection matrices with entries from {-1, +1} for each repetition
+         self.dim_reduction_projections = generator.choice([-1, 1], size=(r_reps, dim, dim_proj))
+
+     @classmethod
+     def from_multivector_model(
+         cls,
+         model: MultiVectorModel,
+         k_sim: int = 5,
+         dim_proj: int = 16,
+         r_reps: int = 20,  # noqa[naming]
+         random_seed: int = 42,
+     ) -> "Muvera":
+         """
+         Create a Muvera instance from a multi-vector embedding model.
+
+         This class method provides a convenient way to initialize a MUVERA instance
+         that is compatible with a given multi-vector model by automatically extracting
+         the embedding dimensionality from the model.
+
+         Args:
+             model (MultiVectorModel): A late interaction text or multimodal embedding model
+                 that provides multi-vector embeddings. Must have an
+                 `embedding_size` attribute specifying the dimensionality
+                 of individual vectors.
+             k_sim (int, optional): Number of SimHash functions (creates 2^k_sim clusters).
+                 Defaults to 5.
+             dim_proj (int, optional): Dimensionality after random projection (must be <= the model's
+                 embedding_size). Defaults to 16.
+             r_reps (int, optional): Number of random projection repetitions for robustness.
+                 Defaults to 20.
+             random_seed (int, optional): Seed for the random number generator to ensure
+                 reproducible results. Defaults to 42.
+
+         Returns:
+             Muvera: A configured MUVERA instance ready to process embeddings from the given model.
+
+         Raises:
+             ValueError: If dim_proj > model.embedding_size (cannot project to a higher dimensionality)
+
+         Example:
+             >>> from fastembed import LateInteractionTextEmbedding
+             >>> model = LateInteractionTextEmbedding(model_name="colbert-ir/colbertv2.0")
+             >>> muvera = Muvera.from_multivector_model(
+             ...     model=model,
+             ...     k_sim=6,
+             ...     dim_proj=32
+             ... )
+             >>> # Now use the postprocessor with embeddings from the model
+             >>> embeddings = np.array(list(model.embed(["sample text"])))
+             >>> fde = muvera.process_document(embeddings[0])
+         """
+         return cls(
+             dim=model.embedding_size,
+             k_sim=k_sim,
+             dim_proj=dim_proj,
+             r_reps=r_reps,
+             random_seed=random_seed,
+         )
+
+     def _get_output_dimension(self) -> int:
+         """
+         Get the output dimension of the MUVERA encoding.
+
+         Returns:
+             int: Output dimension (r_reps * num_partitions * dim_proj), where
+                 num_partitions = 2^k_sim
+         """
+         num_partitions = 2**self.k_sim
+         return self.r_reps * num_partitions * self.dim_proj
+
+     @property
+     def embedding_size(self) -> int:
+         return self._get_output_dimension()
+
+     def process_document(self, vectors: NumpyArray) -> NumpyArray:
+         """
+         Encode a document's vectors into a Fixed Dimensional Encoding (FDE).
+
+         Uses document-specific settings: normalizes cluster centers by vector count
+         and fills empty clusters using Hamming distance-based selection.
+
+         Args:
+             vectors (NumpyArray): Document vectors of shape (n_tokens, dim)
+
+         Returns:
+             NumpyArray: Fixed dimensional encoding of shape (r_reps * b * dim_proj,),
+                 where b = 2^k_sim
+         """
+         return self.process(vectors, fill_empty_clusters=True, normalize_by_count=True)
+
+     def process_query(self, vectors: NumpyArray) -> NumpyArray:
+         """
+         Encode a query's vectors into a Fixed Dimensional Encoding (FDE).
+
+         Uses query-specific settings: no normalization by count and no empty
+         cluster filling, to preserve query vector magnitudes.
+
+         Args:
+             vectors (NumpyArray): Query vectors of shape (n_tokens, dim)
+
+         Returns:
+             NumpyArray: Fixed dimensional encoding of shape (r_reps * b * dim_proj,),
+                 where b = 2^k_sim
+         """
+         return self.process(vectors, fill_empty_clusters=False, normalize_by_count=False)
+
+     def process(
+         self,
+         vectors: NumpyArray,
+         fill_empty_clusters: bool = True,
+         normalize_by_count: bool = True,
+     ) -> NumpyArray:
+         """
+         Core encoding method that transforms variable-length vector sequences into FDEs.
+
+         The encoding process:
+         1. For each of r_reps random projections:
+             a. Assign vectors to clusters using SimHash
+             b. Compute cluster centers (sum of vectors in each cluster)
+             c. Optionally normalize by cluster size
+             d. Fill empty clusters using Hamming distance if requested
+             e. Apply random projection for dimensionality reduction
+             f. Flatten cluster centers into a vector
+         2. Concatenate all projection results
+
+         Args:
+             vectors (np.ndarray): Input vectors of shape (n_vectors, dim)
+             fill_empty_clusters (bool): Whether to fill empty clusters using nearest
+                 vectors based on Hamming distance of cluster IDs
+             normalize_by_count (bool): Whether to normalize cluster centers by the
+                 number of vectors assigned to each cluster
+
+         Returns:
+             np.ndarray: Fixed dimensional encoding of shape (r_reps * b * dim_proj,),
+                 where b = 2^k_sim is the number of clusters
+
+         Raises:
+             AssertionError: If input vectors don't have the expected dimensionality
+         """
+         assert (
+             vectors.shape[1] == self.dim
+         ), f"Expected vectors of shape (n, {self.dim}), got {vectors.shape}"
+
+         # Store results from each random projection
+         output_vectors = []
+
+         # Number of space partitions in SimHash
+         num_partitions = 2**self.k_sim
+         cluster_center_ids = np.arange(num_partitions)
+         precomputed_hamming_matrix = (
+             hamming_distance_matrix(cluster_center_ids) if fill_empty_clusters else None
+         )
+
+         for projection_index, simhash in enumerate(self.simhash_projections):
+             # Initialize cluster centers and track which vectors fall into each cluster
+             cluster_centers = np.zeros((num_partitions, self.dim))
+             cluster_center_id_to_vectors: dict[int, list[int]] = {
+                 cluster_center_id: [] for cluster_center_id in cluster_center_ids
+             }
+             cluster_vector_counts = None
+             empty_mask = None
+
+             # Assign each vector to its cluster and accumulate cluster centers
+             vector_cluster_ids = simhash.get_cluster_ids(vectors)
+             for cluster_id, (vec_idx, vec) in zip(vector_cluster_ids, enumerate(vectors)):
+                 cluster_centers[cluster_id] += vec
+                 cluster_center_id_to_vectors[cluster_id].append(vec_idx)
+
+             if normalize_by_count or fill_empty_clusters:
+                 cluster_vector_counts = np.bincount(vector_cluster_ids, minlength=num_partitions)
+                 empty_mask = cluster_vector_counts == 0
+
+             if normalize_by_count:
+                 assert empty_mask is not None
+                 assert cluster_vector_counts is not None
+                 non_empty_mask = ~empty_mask
+                 cluster_centers[non_empty_mask] /= cluster_vector_counts[non_empty_mask][:, None]
+
+             # Fill empty clusters using vectors with minimum Hamming distance
+             if fill_empty_clusters:
+                 assert empty_mask is not None
+                 assert precomputed_hamming_matrix is not None
+                 masked_hamming = np.where(
+                     empty_mask[None, :], MAX_HAMMING_DISTANCE, precomputed_hamming_matrix
+                 )
+                 nearest_non_empty = np.argmin(masked_hamming, axis=1)
+                 fill_vectors = np.array(
+                     [
+                         vectors[cluster_center_id_to_vectors[cluster_id][0]]
+                         for cluster_id in nearest_non_empty[empty_mask]
+                     ]
+                 ).reshape(-1, self.dim)
+                 cluster_centers[empty_mask] = fill_vectors
+
+             # Apply random projection for dimensionality reduction if needed
+             if self.dim_proj < self.dim:
+                 dim_reduction_projection = self.dim_reduction_projections[
+                     projection_index
+                 ]  # Get the projection matrix for this repetition
+                 projected_centers = (1 / np.sqrt(self.dim_proj)) * (
+                     cluster_centers @ dim_reduction_projection
+                 )
+
+                 # Flatten cluster centers into a single vector and add to the output
+                 output_vectors.append(projected_centers.flatten())
+                 continue
+
+             # If no projection is needed (dim_proj == dim), use the original cluster centers
+             output_vectors.append(cluster_centers.flatten())
+
+         # Concatenate the results from all r_reps projections into the final FDE
+         return np.concatenate(output_vectors)
+
+
+ if __name__ == "__main__":
+     v_arrs = np.random.randn(10, 100, 128)
+     muvera = Muvera(128, 4, 8, 20, 42)
+
+     for v_arr in v_arrs:
+         muvera.process(v_arr)  # type: ignore
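
A minimal usage sketch for the postprocessor above (not part of the diff), assuming only the Muvera methods shown in this file and the import path from the file list: documents and queries are encoded with different settings, both FDEs have the same fixed size, and scoring reduces to a single dot product, which approximates the multi-vector MaxSim score.

import numpy as np

from fastembed.postprocess.muvera import Muvera

muvera = Muvera(dim=128, k_sim=4, dim_proj=8, r_reps=20)

# Variable-length multi-vector embeddings: a 40-token document, an 8-token query
doc_vectors = np.random.randn(40, 128)
query_vectors = np.random.randn(8, 128)

# Documents normalize cluster centers and fill empty clusters;
# queries keep raw sums so vector magnitudes are preserved.
doc_fde = muvera.process_document(doc_vectors)
query_fde = muvera.process_query(query_vectors)

# Both FDEs share the same fixed size: r_reps * 2**k_sim * dim_proj = 20 * 16 * 8
assert doc_fde.shape == query_fde.shape == (muvera.embedding_size,)

# The dot product of the two FDEs approximates the Chamfer (MaxSim) similarity
score = float(query_fde @ doc_fde)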
fastembed/py.typed ADDED
@@ -0,0 +1 @@
+ partial
fastembed/rerank/cross_encoder/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from fastembed.rerank.cross_encoder.text_cross_encoder import TextCrossEncoder
+
+ __all__ = ["TextCrossEncoder"]
fastembed/rerank/cross_encoder/custom_text_cross_encoder.py ADDED
@@ -0,0 +1,47 @@
+ from typing import Sequence, Any
+
+ from fastembed.common import OnnxProvider
+ from fastembed.common.model_description import BaseModelDescription
+ from fastembed.common.types import Device
+ from fastembed.rerank.cross_encoder.onnx_text_cross_encoder import OnnxTextCrossEncoder
+
+
+ class CustomTextCrossEncoder(OnnxTextCrossEncoder):
+     SUPPORTED_MODELS: list[BaseModelDescription] = []
+
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         providers: Sequence[OnnxProvider] | None = None,
+         cuda: bool | Device = Device.AUTO,
+         device_ids: list[int] | None = None,
+         lazy_load: bool = False,
+         device_id: int | None = None,
+         specific_model_path: str | None = None,
+         **kwargs: Any,
+     ):
+         super().__init__(
+             model_name=model_name,
+             cache_dir=cache_dir,
+             threads=threads,
+             providers=providers,
+             cuda=cuda,
+             device_ids=device_ids,
+             lazy_load=lazy_load,
+             device_id=device_id,
+             specific_model_path=specific_model_path,
+             **kwargs,
+         )
+
+     @classmethod
+     def _list_supported_models(cls) -> list[BaseModelDescription]:
+         return cls.SUPPORTED_MODELS
+
+     @classmethod
+     def add_model(
+         cls,
+         model_description: BaseModelDescription,
+     ) -> None:
+         cls.SUPPORTED_MODELS.append(model_description)
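
The class-level registry above is a simple extension point: descriptions appended via add_model become visible through _list_supported_models. A short sketch (not part of the diff; "my-org/my-reranker" is a hypothetical repository, and the BaseModelDescription fields mirror the entries in onnx_text_cross_encoder.py below):

from fastembed.common.model_description import BaseModelDescription, ModelSource
from fastembed.rerank.cross_encoder.custom_text_cross_encoder import CustomTextCrossEncoder

# Register a custom (hypothetical) model description
CustomTextCrossEncoder.add_model(
    BaseModelDescription(
        model="my-org/my-reranker",
        description="Privately fine-tuned cross-encoder.",
        license="apache-2.0",
        size_in_GB=0.1,
        sources=ModelSource(hf="my-org/my-reranker"),
        model_file="onnx/model.onnx",
    )
)

# Once registered, the name resolves like any built-in description
encoder = CustomTextCrossEncoder(model_name="my-org/my-reranker")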
fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py ADDED
@@ -0,0 +1,239 @@
+ from typing import Any, Iterable, Sequence, Type
+
+ from loguru import logger
+
+ from fastembed.common import OnnxProvider
+ from fastembed.common.onnx_model import OnnxOutputContext
+ from fastembed.common.types import Device
+ from fastembed.common.utils import define_cache_dir
+ from fastembed.rerank.cross_encoder.onnx_text_model import (
+     OnnxCrossEncoderModel,
+     TextRerankerWorker,
+ )
+ from fastembed.rerank.cross_encoder.text_cross_encoder_base import TextCrossEncoderBase
+ from fastembed.common.model_description import BaseModelDescription, ModelSource
+
+ supported_onnx_models: list[BaseModelDescription] = [
+     BaseModelDescription(
+         model="Xenova/ms-marco-MiniLM-L-6-v2",
+         description="MiniLM-L-6-v2 model optimized for re-ranking tasks.",
+         license="apache-2.0",
+         size_in_GB=0.08,
+         sources=ModelSource(hf="Xenova/ms-marco-MiniLM-L-6-v2"),
+         model_file="onnx/model.onnx",
+     ),
+     BaseModelDescription(
+         model="Xenova/ms-marco-MiniLM-L-12-v2",
+         description="MiniLM-L-12-v2 model optimized for re-ranking tasks.",
+         license="apache-2.0",
+         size_in_GB=0.12,
+         sources=ModelSource(hf="Xenova/ms-marco-MiniLM-L-12-v2"),
+         model_file="onnx/model.onnx",
+     ),
+     BaseModelDescription(
+         model="BAAI/bge-reranker-base",
+         description="BGE reranker base model for cross-encoder re-ranking.",
+         license="mit",
+         size_in_GB=1.04,
+         sources=ModelSource(hf="BAAI/bge-reranker-base"),
+         model_file="onnx/model.onnx",
+     ),
+     BaseModelDescription(
+         model="jinaai/jina-reranker-v1-tiny-en",
+         description="Designed for blazing-fast re-ranking with 8K context length and fewer parameters than jina-reranker-v1-turbo-en.",
+         license="apache-2.0",
+         size_in_GB=0.13,
+         sources=ModelSource(hf="jinaai/jina-reranker-v1-tiny-en"),
+         model_file="onnx/model.onnx",
+     ),
+     BaseModelDescription(
+         model="jinaai/jina-reranker-v1-turbo-en",
+         description="Designed for blazing-fast re-ranking with 8K context length.",
+         license="apache-2.0",
+         size_in_GB=0.15,
+         sources=ModelSource(hf="jinaai/jina-reranker-v1-turbo-en"),
+         model_file="onnx/model.onnx",
+     ),
+     BaseModelDescription(
+         model="jinaai/jina-reranker-v2-base-multilingual",
+         description="A multi-lingual reranker model for cross-encoder re-ranking with 1K context length and sliding window",
+         license="cc-by-nc-4.0",
+         size_in_GB=1.11,
+         sources=ModelSource(hf="jinaai/jina-reranker-v2-base-multilingual"),
+         model_file="onnx/model.onnx",
+     ),
+ ]
+
+
+ class OnnxTextCrossEncoder(TextCrossEncoderBase, OnnxCrossEncoderModel):
+     @classmethod
+     def _list_supported_models(cls) -> list[BaseModelDescription]:
+         """Lists the supported models.
+
+         Returns:
+             list[BaseModelDescription]: A list of BaseModelDescription objects containing the model information.
+         """
+         return supported_onnx_models
+
+     def __init__(
+         self,
+         model_name: str,
+         cache_dir: str | None = None,
+         threads: int | None = None,
+         providers: Sequence[OnnxProvider] | None = None,
+         cuda: bool | Device = Device.AUTO,
+         device_ids: list[int] | None = None,
+         lazy_load: bool = False,
+         device_id: int | None = None,
+         specific_model_path: str | None = None,
+         **kwargs: Any,
+     ):
+         """
+         Args:
+             model_name (str): The name of the model to use.
+             cache_dir (str, optional): The path to the cache directory.
+                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                 Defaults to `fastembed_cache` in the system's temp directory.
+             threads (int, optional): The number of threads a single onnxruntime session can use. Defaults to None.
+             providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
+                 Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
+             cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                 Defaults to Device.AUTO.
+             device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                 workers. Should be used with `cuda` set to `True`, `Device.AUTO`, or `Device.CUDA`; mutually exclusive
+                 with `providers`. Defaults to None.
+             lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                 Should be set to True when using multiple GPUs and parallel encoding. Defaults to False.
+             device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+             specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be loaded from a custom location.
+
+         Raises:
+             ValueError: If the model_name is not in the format <org>/<model>, e.g. Xenova/ms-marco-MiniLM-L-6-v2.
+         """
+         super().__init__(model_name, cache_dir, threads, **kwargs)
+         self.providers = providers
+         self.lazy_load = lazy_load
+         self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+         # List of device ids that can be used for data parallel processing in workers
+         self.device_ids = device_ids
+         self.cuda = cuda
+
+         if self.device_ids is not None and len(self.device_ids) > 1:
+             logger.warning(
+                 "Parallel execution is currently not supported for cross encoders, "
+                 f"only the first device will be used for inference: {self.device_ids[0]}."
+             )
+
+         # This device_id will be used if we need to load the model in the current process
+         self.device_id: int | None = None
+         if device_id is not None:
+             self.device_id = device_id
+         elif self.device_ids is not None:
+             self.device_id = self.device_ids[0]
+
+         self.model_description = self._get_model_description(model_name)
+         self.cache_dir = str(define_cache_dir(cache_dir))
+         self._specific_model_path = specific_model_path
+         self._model_dir = self.download_model(
+             self.model_description,
+             self.cache_dir,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+         )
+
+         if not self.lazy_load:
+             self.load_onnx_model()
+
+     def load_onnx_model(self) -> None:
+         self._load_onnx_model(
+             model_dir=self._model_dir,
+             model_file=self.model_description.model_file,
+             threads=self.threads,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_id=self.device_id,
+             extra_session_options=self._extra_session_options,
+         )
+
+     def rerank(
+         self,
+         query: str,
+         documents: Iterable[str],
+         batch_size: int = 64,
+         **kwargs: Any,
+     ) -> Iterable[float]:
+         """Reranks documents based on their relevance to a given query.
+
+         Args:
+             query (str): The query string to which document relevance is calculated.
+             documents (Iterable[str]): Iterable of documents to be reranked.
+             batch_size (int, optional): The number of documents processed in each batch. Higher batch sizes improve speed
+                 but require more memory. Default is 64.
+
+         Returns:
+             Iterable[float]: An iterable of relevance scores for each document.
+         """
+
+         yield from self._rerank_documents(
+             query=query, documents=documents, batch_size=batch_size, **kwargs
+         )
+
+     def rerank_pairs(
+         self,
+         pairs: Iterable[tuple[str, str]],
+         batch_size: int = 64,
+         parallel: int | None = None,
+         **kwargs: Any,
+     ) -> Iterable[float]:
+         yield from self._rerank_pairs(
+             model_name=self.model_name,
+             cache_dir=str(self.cache_dir),
+             pairs=pairs,
+             batch_size=batch_size,
+             parallel=parallel,
+             providers=self.providers,
+             cuda=self.cuda,
+             device_ids=self.device_ids,
+             local_files_only=self._local_files_only,
+             specific_model_path=self._specific_model_path,
+             extra_session_options=self._extra_session_options,
+             **kwargs,
+         )
+
+     @classmethod
+     def _get_worker_class(cls) -> Type[TextRerankerWorker]:
+         return TextCrossEncoderWorker
+
+     def _post_process_onnx_output(
+         self, output: OnnxOutputContext, **kwargs: Any
+     ) -> Iterable[float]:
+         return (float(elem) for elem in output.model_output)
+
+     def token_count(
+         self, pairs: Iterable[tuple[str, str]], batch_size: int = 1024, **kwargs: Any
+     ) -> int:
+         """Returns the number of tokens in the pairs.
+
+         Args:
+             pairs: Iterable of tuples, where each tuple contains a query and a document to be tokenized
+             batch_size: Batch size for tokenizing
+
+         Returns:
+             token count: overall number of tokens in the pairs
+         """
+         return self._token_count(pairs, batch_size=batch_size, **kwargs)
+
+
+ class TextCrossEncoderWorker(TextRerankerWorker):
+     def init_embedding(
+         self,
+         model_name: str,
+         cache_dir: str,
+         **kwargs: Any,
+     ) -> OnnxTextCrossEncoder:
+         return OnnxTextCrossEncoder(
+             model_name=model_name,
+             cache_dir=cache_dir,
+             threads=1,
+             **kwargs,
+         )
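
A brief usage sketch for the reranking API above, via the TextCrossEncoder facade exported from the package __init__ (assuming it forwards rerank and rerank_pairs to the ONNX implementation shown here):

from fastembed.rerank.cross_encoder import TextCrossEncoder

# One of the descriptions from supported_onnx_models above
encoder = TextCrossEncoder(model_name="Xenova/ms-marco-MiniLM-L-6-v2")

query = "What is the capital of France?"
documents = [
    "Paris is the capital and most populous city of France.",
    "The Eiffel Tower is a wrought-iron lattice tower.",
]

# rerank() yields one relevance score per document, in input order
scores = list(encoder.rerank(query, documents, batch_size=64))

# rerank_pairs() scores arbitrary (query, document) tuples
pair_scores = list(encoder.rerank_pairs([(query, doc) for doc in documents]))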