fastembed_bio-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. fastembed/__init__.py +24 -0
  2. fastembed/bio/__init__.py +3 -0
  3. fastembed/bio/protein_embedding.py +456 -0
  4. fastembed/common/__init__.py +3 -0
  5. fastembed/common/model_description.py +52 -0
  6. fastembed/common/model_management.py +471 -0
  7. fastembed/common/onnx_model.py +188 -0
  8. fastembed/common/preprocessor_utils.py +84 -0
  9. fastembed/common/types.py +27 -0
  10. fastembed/common/utils.py +69 -0
  11. fastembed/embedding.py +24 -0
  12. fastembed/image/__init__.py +3 -0
  13. fastembed/image/image_embedding.py +135 -0
  14. fastembed/image/image_embedding_base.py +55 -0
  15. fastembed/image/onnx_embedding.py +217 -0
  16. fastembed/image/onnx_image_model.py +156 -0
  17. fastembed/image/transform/functional.py +221 -0
  18. fastembed/image/transform/operators.py +499 -0
  19. fastembed/late_interaction/__init__.py +5 -0
  20. fastembed/late_interaction/colbert.py +301 -0
  21. fastembed/late_interaction/jina_colbert.py +58 -0
  22. fastembed/late_interaction/late_interaction_embedding_base.py +80 -0
  23. fastembed/late_interaction/late_interaction_text_embedding.py +180 -0
  24. fastembed/late_interaction/token_embeddings.py +83 -0
  25. fastembed/late_interaction_multimodal/__init__.py +5 -0
  26. fastembed/late_interaction_multimodal/colmodernvbert.py +532 -0
  27. fastembed/late_interaction_multimodal/colpali.py +327 -0
  28. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py +189 -0
  29. fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py +86 -0
  30. fastembed/late_interaction_multimodal/onnx_multimodal_model.py +291 -0
  31. fastembed/parallel_processor.py +253 -0
  32. fastembed/postprocess/__init__.py +3 -0
  33. fastembed/postprocess/muvera.py +362 -0
  34. fastembed/py.typed +1 -0
  35. fastembed/rerank/cross_encoder/__init__.py +3 -0
  36. fastembed/rerank/cross_encoder/custom_text_cross_encoder.py +47 -0
  37. fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py +239 -0
  38. fastembed/rerank/cross_encoder/onnx_text_model.py +204 -0
  39. fastembed/rerank/cross_encoder/text_cross_encoder.py +178 -0
  40. fastembed/rerank/cross_encoder/text_cross_encoder_base.py +63 -0
  41. fastembed/sparse/__init__.py +4 -0
  42. fastembed/sparse/bm25.py +359 -0
  43. fastembed/sparse/bm42.py +369 -0
  44. fastembed/sparse/minicoil.py +372 -0
  45. fastembed/sparse/sparse_embedding_base.py +90 -0
  46. fastembed/sparse/sparse_text_embedding.py +143 -0
  47. fastembed/sparse/splade_pp.py +196 -0
  48. fastembed/sparse/utils/minicoil_encoder.py +146 -0
  49. fastembed/sparse/utils/sparse_vectors_converter.py +244 -0
  50. fastembed/sparse/utils/tokenizer.py +120 -0
  51. fastembed/sparse/utils/vocab_resolver.py +202 -0
  52. fastembed/text/__init__.py +3 -0
  53. fastembed/text/clip_embedding.py +56 -0
  54. fastembed/text/custom_text_embedding.py +97 -0
  55. fastembed/text/multitask_embedding.py +109 -0
  56. fastembed/text/onnx_embedding.py +353 -0
  57. fastembed/text/onnx_text_model.py +180 -0
  58. fastembed/text/pooled_embedding.py +136 -0
  59. fastembed/text/pooled_normalized_embedding.py +164 -0
  60. fastembed/text/text_embedding.py +228 -0
  61. fastembed/text/text_embedding_base.py +75 -0
  62. fastembed_bio-0.1.0.dist-info/METADATA +339 -0
  63. fastembed_bio-0.1.0.dist-info/RECORD +66 -0
  64. fastembed_bio-0.1.0.dist-info/WHEEL +4 -0
  65. fastembed_bio-0.1.0.dist-info/licenses/LICENSE +201 -0
  66. fastembed_bio-0.1.0.dist-info/licenses/NOTICE +22 -0
fastembed/sparse/bm42.py
@@ -0,0 +1,369 @@
+import math
+import string
+from pathlib import Path
+from typing import Any, Iterable, Sequence, Type
+
+import mmh3
+import numpy as np
+from py_rust_stemmers import SnowballStemmer
+
+from fastembed.common import OnnxProvider
+from fastembed.common.onnx_model import OnnxOutputContext
+from fastembed.common.types import Device
+from fastembed.common.utils import define_cache_dir
+from fastembed.sparse.sparse_embedding_base import (
+    SparseEmbedding,
+    SparseTextEmbeddingBase,
+)
+from fastembed.text.onnx_text_model import OnnxTextModel, TextEmbeddingWorker
+from fastembed.common.model_description import SparseModelDescription, ModelSource
+
+supported_bm42_models: list[SparseModelDescription] = [
+    SparseModelDescription(
+        model="Qdrant/bm42-all-minilm-l6-v2-attentions",
+        vocab_size=30522,
+        description="Light sparse embedding model, which assigns an importance score to each token in the text",
+        license="apache-2.0",
+        size_in_GB=0.09,
+        sources=ModelSource(hf="Qdrant/all_miniLM_L6_v2_with_attentions"),
+        model_file="model.onnx",
+        additional_files=["stopwords.txt"],
+        requires_idf=True,
+    ),
+]
+
+
+_MODEL_TO_LANGUAGE = {
+    "Qdrant/bm42-all-minilm-l6-v2-attentions": "english",
+}
+MODEL_TO_LANGUAGE = {
+    model_name.lower(): language for model_name, language in _MODEL_TO_LANGUAGE.items()
+}
+
+
+def get_language_by_model_name(model_name: str) -> str:
+    return MODEL_TO_LANGUAGE[model_name.lower()]
+
+
+class Bm42(SparseTextEmbeddingBase, OnnxTextModel[SparseEmbedding]):
+    """
+    Bm42 is an extension of BM25, which tries to better evaluate the importance of tokens in documents
+    by extracting attention weights from the transformer model.
+
+    Traditional BM25 uses a count of tokens in the document to evaluate the importance of the token,
+    but this approach doesn't work well with short documents or chunks of text, as almost all tokens
+    there are unique.
+
+    BM42 addresses this issue by replacing the token count with the attention weights from the transformer model.
+    This allows sparse embeddings to work well with short documents, handle rare tokens and leverage traditional NLP
+    techniques like stemming and stopwords.
+
+    WARNING: This model is expected to be used with `modifier="idf"` in the sparse vector index of Qdrant.
+    """
+
+    ONNX_OUTPUT_NAMES = ["attention_6"]
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: str | None = None,
+        threads: int | None = None,
+        providers: Sequence[OnnxProvider] | None = None,
+        alpha: float = 0.5,
+        cuda: bool | Device = Device.AUTO,
+        device_ids: list[int] | None = None,
+        lazy_load: bool = False,
+        device_id: int | None = None,
+        specific_model_path: str | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the cache directory.
+                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
+                Defaults to `fastembed_cache` in the system's temp directory.
+            threads (int, optional): The number of threads a single onnxruntime session can use. Defaults to None.
+            providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime.
+            alpha (float, optional): Parameter that defines the importance of the token weight in the document
+                versus the importance of the token frequency in the corpus. Defaults to 0.5, based on empirical testing.
+                It is recommended to only change this parameter based on training data for a specific dataset.
+            cuda (Union[bool, Device], optional): Whether to use cuda for inference. Mutually exclusive with `providers`.
+                Defaults to Device.AUTO.
+            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
+                workers. Should be used with `cuda` equal to `True`, `Device.AUTO` or `Device.CUDA`, mutually exclusive
+                with `providers`. Defaults to None.
+            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
+                Should be set to True when using multiple GPUs and parallel encoding. Defaults to False.
+            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
+            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else.
+
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+
+        super().__init__(model_name, cache_dir, threads, **kwargs)
+        self.providers = providers
+        self.lazy_load = lazy_load
+        self._extra_session_options = self._select_exposed_session_options(kwargs)
+
+        # List of device ids that can be used for data parallel processing in workers
+        self.device_ids = device_ids
+        self.cuda = cuda
+
+        # This device_id will be used if we need to load the model in the current process
+        self.device_id: int | None = None
+        if device_id is not None:
+            self.device_id = device_id
+        elif self.device_ids is not None:
+            self.device_id = self.device_ids[0]
+
+        self.model_description = self._get_model_description(model_name)
+        self.cache_dir = str(define_cache_dir(cache_dir))
+
+        self._specific_model_path = specific_model_path
+        self._model_dir = self.download_model(
+            self.model_description,
+            self.cache_dir,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+        )
+
+        self.invert_vocab: dict[int, str] = {}
+
+        self.special_tokens: set[str] = set()
+        self.special_tokens_ids: set[int] = set()
+        self.punctuation = set(string.punctuation)
+        self.stopwords = set(self._load_stopwords(self._model_dir))
+        self.stemmer = SnowballStemmer(get_language_by_model_name(self.model_name))
+        self.alpha = alpha
+
+        if not self.lazy_load:
+            self.load_onnx_model()
+
+    def load_onnx_model(self) -> None:
+        self._load_onnx_model(
+            model_dir=self._model_dir,
+            model_file=self.model_description.model_file,
+            threads=self.threads,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_id=self.device_id,
+            extra_session_options=self._extra_session_options,
+        )
+
+        for token, idx in self.tokenizer.get_vocab().items():  # type: ignore[union-attr]
+            self.invert_vocab[idx] = token
+        self.special_tokens = set(self.special_token_to_id.keys())
+        self.special_tokens_ids = set(self.special_token_to_id.values())
+        self.stopwords = set(self._load_stopwords(self._model_dir))
+
+    def _filter_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]:
+        result: list[tuple[str, Any]] = []
+        for token, value in tokens:
+            if token in self.stopwords or token in self.punctuation:
+                continue
+            result.append((token, value))
+        return result
+
+    def _stem_pair_tokens(self, tokens: list[tuple[str, Any]]) -> list[tuple[str, Any]]:
+        result: list[tuple[str, Any]] = []
+        for token, value in tokens:
+            processed_token = self.stemmer.stem_word(token)
+            result.append((processed_token, value))
+        return result
+
+    @classmethod
+    def _aggregate_weights(
+        cls, tokens: list[tuple[str, list[int]]], weights: list[float]
+    ) -> list[tuple[str, float]]:
+        result: list[tuple[str, float]] = []
+        for token, idxs in tokens:
+            sum_weight = sum(weights[idx] for idx in idxs)
+            result.append((token, sum_weight))
+        return result
+
+    def _reconstruct_bpe(
+        self, bpe_tokens: Iterable[tuple[int, str]]
+    ) -> list[tuple[str, list[int]]]:
+        result: list[tuple[str, list[int]]] = []
+        acc: str = ""
+        acc_idx: list[int] = []
+
+        continuing_subword_prefix = self.tokenizer.model.continuing_subword_prefix  # type: ignore[union-attr]
+        continuing_subword_prefix_len = len(continuing_subword_prefix)
+
+        for idx, token in bpe_tokens:
+            if token in self.special_tokens:
+                continue
+
+            if token.startswith(continuing_subword_prefix):
+                acc += token[continuing_subword_prefix_len:]
+                acc_idx.append(idx)
+            else:
+                if acc:
+                    result.append((acc, acc_idx))
+                    acc_idx = []
+                acc = token
+                acc_idx.append(idx)
+
+        if acc:
+            result.append((acc, acc_idx))
+
+        return result
+
+    def _rescore_vector(self, vector: dict[str, float]) -> dict[int, float]:
+        """
+        Orders all tokens in the vector by their importance and generates a new score based on the importance order.
+        So that the scoring doesn't depend on absolute values assigned by the model, but on the relative importance.
+        """
+
+        new_vector: dict[int, float] = {}
+
+        for token, value in vector.items():
+            token_id = abs(mmh3.hash(token))
+            # Examples:
+            # Num 0: Log(1/1 + 1) = 0.6931471805599453
+            # Num 1: Log(1/2 + 1) = 0.4054651081081644
+            # Num 2: Log(1/3 + 1) = 0.28768207245178085
+            new_vector[token_id] = math.log(1.0 + value) ** self.alpha  # value
+
+        return new_vector
+
+    def _post_process_onnx_output(
+        self, output: OnnxOutputContext, **kwargs: Any
+    ) -> Iterable[SparseEmbedding]:
+        if output.input_ids is None:
+            raise ValueError("input_ids must be provided for document post-processing")
+
+        token_ids_batch = output.input_ids.astype(int)
+
+        # attention_value shape: (batch_size, num_heads, num_tokens, num_tokens)
+        pooled_attention = np.mean(output.model_output[:, :, 0], axis=1) * output.attention_mask
+
+        for document_token_ids, attention_value in zip(token_ids_batch, pooled_attention):
+            document_tokens_with_ids = (
+                (idx, self.invert_vocab[token_id])
+                for idx, token_id in enumerate(document_token_ids)
+            )
+
+            reconstructed = self._reconstruct_bpe(document_tokens_with_ids)
+
+            filtered = self._filter_pair_tokens(reconstructed)
+
+            stemmed = self._stem_pair_tokens(filtered)
+
+            weighted = self._aggregate_weights(stemmed, attention_value)
+
+            max_token_weight: dict[str, float] = {}
+
+            for token, weight in weighted:
+                max_token_weight[token] = max(max_token_weight.get(token, 0), weight)
+
+            rescored = self._rescore_vector(max_token_weight)
+
+            yield SparseEmbedding.from_dict(rescored)
+
+    @classmethod
+    def _list_supported_models(cls) -> list[SparseModelDescription]:
+        """Lists the supported models.
+
+        Returns:
+            list[SparseModelDescription]: A list of SparseModelDescription objects containing the model information.
+        """
+        return supported_bm42_models
+
+    @classmethod
+    def _load_stopwords(cls, model_dir: Path) -> list[str]:
+        stopwords_path = model_dir / "stopwords.txt"
+        if not stopwords_path.exists():
+            return []
+
+        with open(stopwords_path, "r") as f:
+            return f.read().splitlines()
+
+    def embed(
+        self,
+        documents: str | Iterable[str],
+        batch_size: int = 256,
+        parallel: int | None = None,
+        **kwargs: Any,
+    ) -> Iterable[SparseEmbedding]:
+        """
+        Encode a list of documents into a list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+
+        Returns:
+            List of embeddings, one per document
+        """
+        yield from self._embed_documents(
+            model_name=self.model_name,
+            cache_dir=str(self.cache_dir),
+            documents=documents,
+            batch_size=batch_size,
+            parallel=parallel,
+            providers=self.providers,
+            cuda=self.cuda,
+            device_ids=self.device_ids,
+            alpha=self.alpha,
+            local_files_only=self._local_files_only,
+            specific_model_path=self._specific_model_path,
+            extra_session_options=self._extra_session_options,
+        )
+
+    @classmethod
+    def _query_rehash(cls, tokens: Iterable[str]) -> dict[int, float]:
+        result: dict[int, float] = {}
+        for token in tokens:
+            token_id = abs(mmh3.hash(token))
+            result[token_id] = 1.0
+        return result
+
+    def query_embed(self, query: str | Iterable[str], **kwargs: Any) -> Iterable[SparseEmbedding]:
+        """
+        To emulate BM25 behaviour, we don't need to use smart weights in the query, and
+        it's enough to just hash the tokens and assign a weight of 1.0 to them.
+        It is also faster, as we don't need to run the model for the query.
+        """
+        if isinstance(query, str):
+            query = [query]
+
+        if not hasattr(self, "model") or self.model is None:
+            self.load_onnx_model()
+
+        for text in query:
+            encoded = self.tokenizer.encode(text)  # type: ignore[union-attr]
+            document_tokens_with_ids = enumerate(encoded.tokens)
+            reconstructed = self._reconstruct_bpe(document_tokens_with_ids)
+            filtered = self._filter_pair_tokens(reconstructed)
+            stemmed = self._stem_pair_tokens(filtered)
+
+            yield SparseEmbedding.from_dict(self._query_rehash(token for token, _ in stemmed))
+
+    @classmethod
+    def _get_worker_class(cls) -> Type[TextEmbeddingWorker[SparseEmbedding]]:
+        return Bm42TextEmbeddingWorker
+
+    def token_count(
+        self, texts: str | Iterable[str], batch_size: int = 1024, **kwargs: Any
+    ) -> int:
+        if not hasattr(self, "model") or self.model is None:
+            self.load_onnx_model()  # loads the tokenizer as well
+        return self._token_count(texts, batch_size=batch_size, **kwargs)
+
+
+class Bm42TextEmbeddingWorker(TextEmbeddingWorker[SparseEmbedding]):
+    def init_embedding(self, model_name: str, cache_dir: str, **kwargs: Any) -> Bm42:
+        return Bm42(
+            model_name=model_name,
+            cache_dir=cache_dir,
+            **kwargs,
+        )
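
For context on how the class added in this diff is meant to be used, here is a minimal usage sketch. It assumes fastembed-bio installs the same `fastembed` package layout shown in the file list above; the import path and model name are taken from this file, while the `indices`/`values` attributes of `SparseEmbedding` come from `fastembed/sparse/sparse_embedding_base.py` (listed but not shown in this diff) and are an assumption here.

from fastembed.sparse.bm42 import Bm42

# Model name comes from `supported_bm42_models` in this file;
# the first call downloads the ONNX model into the fastembed cache.
model = Bm42("Qdrant/bm42-all-minilm-l6-v2-attentions")

documents = [
    "BM42 replaces token counts with transformer attention weights.",
    "Sparse vectors pair well with Qdrant's IDF modifier.",
]

# Document side: attention-weighted, stopword-filtered, stemmed sparse vectors.
for embedding in model.embed(documents, batch_size=256):
    print(embedding.indices[:5], embedding.values[:5])  # assumed SparseEmbedding attributes

# Query side: no model inference; tokens are hashed and weighted 1.0 (see query_embed above).
for embedding in model.query_embed("what is bm42?"):
    print(embedding.indices, embedding.values)

As the class docstring warns (and `requires_idf=True` in the model description reflects), the resulting vectors are intended for a Qdrant sparse index configured with `modifier="idf"`, which supplies the corpus-frequency half of the BM25 formula at search time.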
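The relationship between the document and query pipelines can also be shown numerically. The sketch below, with made-up attention weights, mirrors `_rescore_vector` and `_query_rehash`: both sides map stemmed tokens to the same `abs(mmh3.hash(token))` ids, documents carry `log(1 + attention_weight) ** alpha` values, and queries carry a constant 1.0, so the dot product reduces to summing the document-side weights of matching tokens (with IDF applied by Qdrant, not here).

import math
import mmh3

alpha = 0.5  # default from Bm42.__init__

# Document side (_rescore_vector): hashed token id -> log(1 + attention weight) ** alpha.
# The attention weights here are illustrative, not real model output.
doc_tokens = {"attent": 0.12, "weight": 0.08}
doc_vector = {abs(mmh3.hash(t)): math.log(1.0 + w) ** alpha for t, w in doc_tokens.items()}

# Query side (_query_rehash): same hash, constant weight 1.0, no ONNX inference.
query_vector = {abs(mmh3.hash(t)): 1.0 for t in ["attent"]}

# Sparse dot product: only tokens present on both sides contribute.
score = sum(weight * query_vector.get(token_id, 0.0) for token_id, weight in doc_vector.items())
print(round(score, 4))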