mteb-2.2.2-py3-none-any.whl → mteb-2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. mteb/__init__.py +4 -0
  2. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  3. mteb/evaluate.py +38 -7
  4. mteb/models/__init__.py +4 -1
  5. mteb/models/cache_wrappers/__init__.py +2 -1
  6. mteb/models/model_implementations/colpali_models.py +4 -4
  7. mteb/models/model_implementations/colqwen_models.py +206 -2
  8. mteb/models/model_implementations/eagerworks_models.py +163 -0
  9. mteb/models/model_implementations/euler_models.py +25 -0
  10. mteb/models/model_implementations/google_models.py +1 -1
  11. mteb/models/model_implementations/jina_models.py +203 -5
  12. mteb/models/model_implementations/nb_sbert.py +1 -1
  13. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
  14. mteb/models/model_implementations/nvidia_models.py +1 -1
  15. mteb/models/model_implementations/ops_moa_models.py +2 -2
  16. mteb/models/model_implementations/promptriever_models.py +4 -4
  17. mteb/models/model_implementations/qwen3_models.py +3 -3
  18. mteb/models/model_implementations/qzhou_models.py +1 -1
  19. mteb/models/model_implementations/random_baseline.py +8 -18
  20. mteb/models/model_implementations/vdr_models.py +1 -0
  21. mteb/models/model_implementations/yuan_models_en.py +57 -0
  22. mteb/models/search_encoder_index/__init__.py +7 -0
  23. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  24. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  25. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  26. mteb/models/search_wrappers.py +157 -41
  27. mteb/results/model_result.py +2 -1
  28. mteb/results/task_result.py +12 -0
  29. mteb/similarity_functions.py +49 -0
  30. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  31. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  32. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  33. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  34. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
  35. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
  36. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
  37. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
  38. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0
mteb/models/search_wrappers.py

@@ -21,6 +21,7 @@ from mteb.types import (
 )
 
 from .models_protocols import CrossEncoderProtocol, EncoderProtocol
+from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
 
 logger = logging.getLogger(__name__)
 
@@ -28,13 +29,19 @@ logger = logging.getLogger(__name__)
 class SearchEncoderWrapper:
     """Wrapper for Encoder models to be used in search tasks."""
 
-    corpus_chunk_size = 50_000
     task_corpus: CorpusDatasetType | None
 
-    def __init__(self, model: EncoderProtocol):
+    def __init__(
+        self,
+        model: EncoderProtocol,
+        corpus_chunk_size: int = 50_000,
+        index_backend: IndexEncoderSearchProtocol | None = None,
+    ) -> None:
         self.model = model
         self.task_corpus = None
         self.mteb_model_meta = model.mteb_model_meta
+        self.corpus_chunk_size = corpus_chunk_size
+        self.index_backend = index_backend
 
     def index(
         self,
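The constructor change turns the previously hard-coded class attribute into an injectable parameter and adds an optional ANN backend. A minimal wiring sketch under stated assumptions: the FaissSearchIndex class and its module path appear in this release's file list, but its constructor arguments and the checkpoint name below are illustrative only.

    import mteb
    from mteb.models.search_encoder_index.search_indexes.faiss_search_index import (
        FaissSearchIndex,
    )
    from mteb.models.search_wrappers import SearchEncoderWrapper

    # Any object satisfying EncoderProtocol works; the checkpoint is illustrative.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

    wrapper = SearchEncoderWrapper(
        model,
        corpus_chunk_size=10_000,          # was a fixed class attribute (50_000)
        index_backend=FaissSearchIndex(),  # assumed no-arg constructor; None keeps exact search
    )

Passing index_backend=None (the default) preserves the previous behavior, so existing callers are unaffected.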
@@ -56,6 +63,22 @@ class SearchEncoderWrapper:
         """
         # Always retain corpus for potential reranking or fallback flows
         self.task_corpus = corpus
+        if self.index_backend is not None:
+            all_doc_embeddings = self.model.encode(
+                create_dataloader(
+                    corpus,
+                    task_metadata,
+                    prompt_type=PromptType.document,
+                    **encode_kwargs,
+                ),
+                task_metadata=task_metadata,
+                hf_split=hf_split,
+                hf_subset=hf_subset,
+                prompt_type=PromptType.document,
+                **encode_kwargs,
+            )
+
+            self.index_backend.add_documents(all_doc_embeddings, corpus["id"])
 
     def search(
         self,
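This hunk, together with the search() changes below, pins down the surface a backend must expose: add_documents, search, and clear. A reconstruction of what IndexEncoderSearchProtocol (from the new mteb/models/search_encoder_index/search_backend_protocol.py) plausibly looks like, inferred purely from these call sites; the method names match the diff, but the parameter and return annotations are assumptions.

    from typing import Callable, Protocol

    from mteb.types import Array


    class IndexEncoderSearchProtocol(Protocol):
        """Surface inferred from the call sites in this diff; not the actual source."""

        def add_documents(self, embeddings: Array, ids: list[str]) -> None:
            """Store document embeddings keyed by corpus ID (called from index())."""
            ...

        def search(
            self,
            query_embeddings: Array,
            top_k: int,
            *,
            similarity_fn: Callable[[Array, Array], Array],
            top_ranked: dict[str, list[str]] | None,
            query_idx_to_id: dict[int, str] | None,
        ) -> tuple[list[list[float]], list[list[int]]]:
            """Return per-query (top-k scores, top-k corpus indices)."""
            ...

        def clear(self) -> None:
            """Release the index once a split has been evaluated."""
            ...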
@@ -105,27 +128,74 @@
 
         if top_ranked is not None:
             logger.info("Reranking pre-ranked documents...")
-            result_heaps = self._rerank_documents(
-                query_idx_to_id=query_idx_to_id,
-                query_embeddings=query_embeddings,
-                top_ranked=top_ranked,
-                top_k=top_k,
-                task_metadata=task_metadata,
-                hf_subset=hf_subset,
-                hf_split=hf_split,
-                encode_kwargs=encode_kwargs,
-            )
+            if self.index_backend is None:
+                result_heaps = self._rerank_documents(
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    top_ranked=top_ranked,
+                    top_k=top_k,
+                    task_metadata=task_metadata,
+                    hf_subset=hf_subset,
+                    hf_split=hf_split,
+                    encode_kwargs=encode_kwargs,
+                )
+            else:
+                cos_scores_top_k_values, cos_scores_top_k_idx = (
+                    self.index_backend.search(
+                        query_embeddings,
+                        top_k,
+                        similarity_fn=self.model.similarity,
+                        top_ranked=top_ranked,
+                        query_idx_to_id=query_idx_to_id,
+                    )
+                )
+                result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                for query_itr in range(len(query_embeddings)):
+                    result_heaps = self._rerank_sort_results(
+                        result_heaps=result_heaps,
+                        query_id=query_idx_to_id[query_itr],
+                        ranked_ids=top_ranked[query_idx_to_id[query_itr]],
+                        scores_top_k_idx=torch.tensor(
+                            [cos_scores_top_k_idx[query_itr]]
+                        ),
+                        scores_top_k_values=torch.tensor(
+                            [cos_scores_top_k_values[query_itr]]
+                        ),
+                    )
+                self.index_backend.clear()
         else:
             logger.info("Performing full corpus search...")
-            result_heaps = self._full_corpus_search(
-                query_idx_to_id=query_idx_to_id,
-                query_embeddings=query_embeddings,
-                task_metadata=task_metadata,
-                hf_subset=hf_subset,
-                hf_split=hf_split,
-                top_k=top_k,
-                encode_kwargs=encode_kwargs,
-            )
+            if self.index_backend is None:
+                result_heaps = self._full_corpus_search(
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    task_metadata=task_metadata,
+                    hf_subset=hf_subset,
+                    hf_split=hf_split,
+                    top_k=top_k,
+                    encode_kwargs=encode_kwargs,
+                )
+            else:
+                cos_scores_top_k_values, cos_scores_top_k_idx = (
+                    self.index_backend.search(
+                        query_embeddings,
+                        top_k,
+                        similarity_fn=self.model.similarity,
+                        top_ranked=None,
+                        query_idx_to_id=None,
+                    )
+                )
+                result_heaps = {qid: [] for qid in query_idx_to_id.values()}
+                result_heaps = self._sort_full_corpus_results(
+                    result_heaps=result_heaps,
+                    query_idx_to_id=query_idx_to_id,
+                    query_embeddings=query_embeddings,
+                    cos_scores_top_k_idx=cos_scores_top_k_idx,
+                    cos_scores_top_k_values=cos_scores_top_k_values,
+                    sub_corpus_ids=self.task_corpus["id"],
+                    top_k=top_k,
+                )
+                self.index_backend.clear()
 
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None
@@ -192,19 +262,45 @@
         cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
 
         sub_corpus_ids = list(sub_corpus_ids)
-        for query_itr in range(len(query_embeddings)):
-            query_id = query_idx_to_id[query_itr]
-            for sub_corpus_id, score in zip(
-                cos_scores_top_k_idx[query_itr],
-                cos_scores_top_k_values[query_itr],
-            ):
-                corpus_id = sub_corpus_ids[sub_corpus_id]
-                if len(result_heaps[query_id]) < top_k:
-                    # push item on the heap
-                    heapq.heappush(result_heaps[query_id], (score, corpus_id))
-                else:
-                    # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element
-                    heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
+        result_heaps = self._sort_full_corpus_results(
+            result_heaps=result_heaps,
+            query_idx_to_id=query_idx_to_id,
+            query_embeddings=query_embeddings,
+            cos_scores_top_k_idx=cos_scores_top_k_idx,
+            cos_scores_top_k_values=cos_scores_top_k_values,
+            sub_corpus_ids=sub_corpus_ids,
+            top_k=top_k,
+        )
+        return result_heaps
+
+    def _sort_full_corpus_results(
+        self,
+        result_heaps: dict[str, list[tuple[float, str]]],
+        query_idx_to_id: dict[int, str],
+        query_embeddings: Array,
+        cos_scores_top_k_idx: list[list[int]],
+        cos_scores_top_k_values: list[list[float]],
+        sub_corpus_ids: list[str],
+        top_k: int,
+    ) -> dict[str, list[tuple[float, str]]]:
+        """Maintain a bounded top-k heap of scored documents for each query.
+
+        Returns:
+            A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
+        """
+        for query_itr in range(len(query_embeddings)):
+            query_id = query_idx_to_id[query_itr]
+            for sub_corpus_id, score in zip(
+                cos_scores_top_k_idx[query_itr],
+                cos_scores_top_k_values[query_itr],
+            ):
+                corpus_id = sub_corpus_ids[sub_corpus_id]
+                if len(result_heaps[query_id]) < top_k:
+                    # Push the item onto the heap
+                    heapq.heappush(result_heaps[query_id], (score, corpus_id))
+                else:
+                    # If the item is larger than the smallest in the heap, push it, then pop the smallest element
+                    heapq.heappushpop(result_heaps[query_id], (score, corpus_id))
         return result_heaps
 
     def _rerank_documents(
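The extracted helper preserves the existing bounded-heap trick: a min-heap capped at top_k entries, where heappushpop only displaces the current minimum when the incoming score beats it. A standalone illustration of the pattern with toy data (not mteb API):

    import heapq

    top_k = 3
    scored = [("d1", 0.12), ("d2", 0.91), ("d3", 0.55), ("d4", 0.77), ("d5", 0.30)]

    heap: list[tuple[float, str]] = []
    for doc_id, score in scored:
        if len(heap) < top_k:
            heapq.heappush(heap, (score, doc_id))
        else:
            # Pushes, then pops the smallest, so a low score is evicted immediately.
            heapq.heappushpop(heap, (score, doc_id))

    print(sorted(heap, reverse=True))  # [(0.91, 'd2'), (0.77, 'd4'), (0.55, 'd3')]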
@@ -279,14 +375,34 @@
             scores_top_k_values = scores_top_k_values.cpu()
             scores_top_k_idx = scores_top_k_idx.cpu()
 
-            # Build result heap
-            for doc_idx, score in zip(
-                scores_top_k_idx[0].tolist(),
-                scores_top_k_values[0].tolist(),
-            ):
-                corpus_id = ranked_ids[doc_idx]
-                heapq.heappush(result_heaps[query_id], (score, corpus_id))
+            result_heaps = self._rerank_sort_results(
+                result_heaps=result_heaps,
+                query_id=query_id,
+                ranked_ids=ranked_ids,
+                scores_top_k_idx=scores_top_k_idx,
+                scores_top_k_values=scores_top_k_values,
+            )
+        return result_heaps
+
+    def _rerank_sort_results(
+        self,
+        result_heaps: dict[str, list[tuple[float, str]]],
+        query_id: str,
+        ranked_ids: list[str],
+        scores_top_k_idx: torch.Tensor,
+        scores_top_k_values: torch.Tensor,
+    ) -> dict[str, list[tuple[float, str]]]:
+        """Push the scored documents for a single query onto its result heap.
 
+        Returns:
+            The result heaps, updated with (relevance score, document ID) tuples for this query.
+        """
+        for doc_idx, score in zip(
+            scores_top_k_idx[0].tolist(),
+            scores_top_k_values[0].tolist(),
+        ):
+            corpus_id = ranked_ids[doc_idx]
+            heapq.heappush(result_heaps[query_id], (score, corpus_id))
         return result_heaps
 
     def encode(
mteb/results/model_result.py

@@ -22,7 +22,7 @@ from mteb.types import (
     SplitName,
 )
 
-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult
 
 logger = logging.getLogger(__name__)
 
@@ -82,6 +82,7 @@ class ModelResult(BaseModel):
             protected_namespaces=(),
         )
     )
+    exceptions: list[TaskError] | None = None
 
     def __repr__(self) -> str:
         n_entries = len(self.task_results)
mteb/results/task_result.py

@@ -840,3 +840,15 @@ class TaskResult(BaseModel):
             )
         )
         return results
+
+
+class TaskError(BaseModel):
+    """A class to represent an error that occurred during the evaluation of a task.
+
+    Attributes:
+        task_name: The name of the MTEB task.
+        exception: The error message raised during evaluation.
+    """
+
+    task_name: str
+    exception: str
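Together with the new ModelResult.exceptions field above, failed tasks can now travel alongside successful results instead of vanishing from a run. A self-contained sketch of how the record might be built; the failing-evaluation loop is hypothetical, only TaskError and the exceptions field come from this diff.

    from mteb.results.task_result import TaskError

    errors: list[TaskError] = []
    for task_name in ["MultiLongDocReranking", "MIRACLReranking"]:
        try:
            raise RuntimeError("CUDA out of memory")  # stand-in for a failing evaluation
        except Exception as exc:
            errors.append(TaskError(task_name=task_name, exception=str(exc)))

    print(errors[0].task_name, "->", errors[0].exception)
    # A ModelResult built from this run could then carry exceptions=errors.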
mteb/similarity_functions.py

@@ -1,6 +1,7 @@
 import torch
 
 from mteb.models import EncoderProtocol
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import Array
 
 
@@ -38,6 +39,54 @@ def compute_pairwise_similarity(
     return pairwise_cos_sim(embedding1, embedding2)
 
 
+def select_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
+def select_pairwise_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute pairwise similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed pairwise similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return pairwise_cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return pairwise_dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return pairwise_euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
 def _normalize_embeddings(embeddings: Array) -> torch.Tensor:
     """Normalizes the embeddings matrix, so that each sentence embedding has unit length.
 
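Both dispatchers route to the similarity helpers already defined in this module, so a quick smoke test only needs toy tensors. The shape comments assume the usual convention: the matrix variants score every query-document pair, while the pairwise variants score row i against row i.

    import torch

    from mteb.models.model_meta import ScoringFunction
    from mteb.similarity_functions import select_pairwise_similarity, select_similarity

    queries = torch.randn(4, 8)  # 4 embeddings of dimension 8
    docs = torch.randn(6, 8)     # 6 embeddings of dimension 8

    cos = select_similarity(queries, docs, ScoringFunction.COSINE)       # (4, 6) matrix
    dot = select_similarity(queries, docs, ScoringFunction.DOT_PRODUCT)  # (4, 6) matrix

    # Pairwise variants expect equal-length batches.
    pair = select_pairwise_similarity(queries, docs[:4], ScoringFunction.EUCLIDEAN)  # (4,)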
mteb/tasks/reranking/multilingual/__init__.py

@@ -1,6 +1,7 @@
 from .esci_reranking import ESCIReranking
 from .hume_wikipedia_reranking_multilingual import HUMEWikipediaRerankingMultilingual
 from .miracl_reranking import MIRACLReranking
+from .multi_long_doc_reranking import MultiLongDocReranking
 from .wikipedia_reranking_multilingual import WikipediaRerankingMultilingual
 from .x_glue_wpr_reranking import XGlueWPRReranking
 
@@ -8,6 +9,7 @@ __all__ = [
     "ESCIReranking",
     "HUMEWikipediaRerankingMultilingual",
     "MIRACLReranking",
+    "MultiLongDocReranking",
    "WikipediaRerankingMultilingual",
    "XGlueWPRReranking",
 ]
mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py (new file)

@@ -0,0 +1,70 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class MultiLongDocReranking(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="MultiLongDocReranking",
+        description=(
+            "Reranking version of MultiLongDocRetrieval (MLDR). MLDR is a Multilingual Long-Document "
+            "Retrieval dataset built on Wikipedia, Wudao and mC4, covering 13 typologically diverse languages. "
+            "Specifically, we sample lengthy articles from the Wikipedia, Wudao and mC4 datasets and randomly "
+            "choose paragraphs from them. We then use GPT-3.5 to generate questions based on these paragraphs; "
+            "each generated question and its sampled article constitute a new text pair in the dataset."
+        ),
+        reference="https://huggingface.co/datasets/Shitao/MLDR",
+        dataset={
+            "path": "mteb/MultiLongDocReranking",
+            "revision": "ad09ce14c17bce6edae151b7f6ef12e15d91dbf3",
+        },
+        type="Reranking",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ar": ["ara-Arab"],
+            "de": ["deu-Latn"],
+            "en": ["eng-Latn"],
+            "es": ["spa-Latn"],
+            "fr": ["fra-Latn"],
+            "hi": ["hin-Deva"],
+            "it": ["ita-Latn"],
+            "ja": ["jpn-Jpan"],
+            "ko": ["kor-Kore"],
+            "pt": ["por-Latn"],
+            "ru": ["rus-Cyrl"],
+            "th": ["tha-Thai"],
+            "zh": ["zho-Hans"],
+        },
+        main_score="ndcg_at_10",
+        date=(
+            "2000-01-01",
+            "2024-12-31",
+        ),  # Not stated in the paper; estimated from the publication date and constituent datasets
+        domains=[
+            "Encyclopaedic",
+            "Written",
+            "Web",
+            "Non-fiction",
+            "Fiction",
+        ],  # narrativeqa, wikipedia, wudao, mC4
+        task_subtypes=[],
+        license="mit",
+        annotations_creators="LM-generated",  # gpt-3.5
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{bge-m3,
+  archiveprefix = {arXiv},
+  author = {Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
+  eprint = {2402.03216},
+  primaryclass = {cs.CL},
+  title = {BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
+  year = {2024},
+}
+""",
+        prompt={
+            "query": "Given a question, rerank long documents based on their relevance to answer the question"
+        },
+        adapted_from=["MultiLongDocRetrieval"],
+    )
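Once exported from the reranking package (see the __init__.py hunk above), the task is selectable by name. A sketch assuming mteb v2's get_tasks/get_model/evaluate entry points; the checkpoint name is illustrative.

    import mteb

    tasks = mteb.get_tasks(tasks=["MultiLongDocReranking"])
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # illustrative

    # evaluate() lives in mteb/evaluate.py, which this release also touches.
    results = mteb.evaluate(model, tasks=tasks)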
mteb/tasks/retrieval/eng/vidore_bench_retrieval.py

@@ -351,6 +351,7 @@ class VidoreSyntheticDocQAAIRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -394,6 +395,7 @@ class VidoreSyntheticDocQAEnergyRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -437,6 +439,7 @@ class VidoreSyntheticDocQAGovernmentReportsRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None:

@@ -480,6 +483,7 @@ class VidoreSyntheticDocQAHealthcareIndustryRetrieval(AbsTaskRetrieval):
         }
         """,
         prompt={"query": "Find a screenshot that is relevant to the user's question."},
+        adapted_from=["VidoreDocVQARetrieval"],
     )
 
     def load_data(self) -> None: