mteb 2.2.2-py3-none-any.whl → 2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. mteb/__init__.py +4 -0
  2. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  3. mteb/evaluate.py +38 -7
  4. mteb/models/__init__.py +4 -1
  5. mteb/models/cache_wrappers/__init__.py +2 -1
  6. mteb/models/model_implementations/colpali_models.py +4 -4
  7. mteb/models/model_implementations/colqwen_models.py +206 -2
  8. mteb/models/model_implementations/eagerworks_models.py +163 -0
  9. mteb/models/model_implementations/euler_models.py +25 -0
  10. mteb/models/model_implementations/google_models.py +1 -1
  11. mteb/models/model_implementations/jina_models.py +203 -5
  12. mteb/models/model_implementations/nb_sbert.py +1 -1
  13. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
  14. mteb/models/model_implementations/nvidia_models.py +1 -1
  15. mteb/models/model_implementations/ops_moa_models.py +2 -2
  16. mteb/models/model_implementations/promptriever_models.py +4 -4
  17. mteb/models/model_implementations/qwen3_models.py +3 -3
  18. mteb/models/model_implementations/qzhou_models.py +1 -1
  19. mteb/models/model_implementations/random_baseline.py +8 -18
  20. mteb/models/model_implementations/vdr_models.py +1 -0
  21. mteb/models/model_implementations/yuan_models_en.py +57 -0
  22. mteb/models/search_encoder_index/__init__.py +7 -0
  23. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  24. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  25. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  26. mteb/models/search_wrappers.py +157 -41
  27. mteb/results/model_result.py +2 -1
  28. mteb/results/task_result.py +12 -0
  29. mteb/similarity_functions.py +49 -0
  30. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  31. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  32. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  33. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  34. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
  35. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
  36. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
  37. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
  38. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
  39. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
  40. {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0

mteb/models/model_implementations/colqwen_models.py
@@ -1,11 +1,19 @@
 import logging
+from typing import Any

 import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm

 from mteb._requires_package import (
+    requires_image_dependencies,
     requires_package,
 )
-from mteb.models.model_meta import ModelMeta
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+from mteb.types import Array, BatchedInput, PromptType

 from .colpali_models import (
     COLPALI_CITATION,
@@ -73,6 +81,132 @@ class ColQwen2_5Wrapper(ColPaliEngineWrapper): # noqa: N801
         )


+class ColQwen3Wrapper(AbsEncoder):
+    """Wrapper for the ColQwen3 vision-language retrieval model."""
+
+    def __init__(
+        self,
+        model_name: str,
+        *,
+        revision: str | None = None,
+        device: str | None = None,
+        dtype: torch.dtype | str | None = torch.bfloat16,
+        **kwargs: Any,
+    ):
+        requires_image_dependencies()
+        requires_package(self, "transformers", model_name, "pip install mteb[colqwen3]")
+        from transformers import AutoModel, AutoProcessor
+
+        self.device = device or (
+            "cuda"
+            if torch.cuda.is_available()
+            else "mps"
+            if torch.backends.mps.is_available()
+            else "cpu"
+        )
+        self.model = AutoModel.from_pretrained(
+            model_name,
+            revision=revision,
+            dtype=dtype,
+            trust_remote_code=True,
+            **kwargs,
+        ).to(self.device)
+        self.model.eval()
+
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            revision=revision,
+            trust_remote_code=True,
+            max_num_visual_tokens=1280,
+        )
+
+    def encode(
+        self,
+        inputs: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        if (
+            "text" not in inputs.dataset.features
+            and "image" not in inputs.dataset.features
+        ):
+            raise ValueError("No text or image features found in inputs.")
+        return self.get_fused_embeddings(inputs, **kwargs)
+
+    def _encode_inputs(self, encoded_inputs: dict[str, torch.Tensor]) -> torch.Tensor:
+        outputs = self.model(**encoded_inputs)
+        # Avoid boolean casting of tensors when checking for custom attributes.
+        embeddings = getattr(outputs, "embeddings", None)
+        if embeddings is None:
+            embeddings = outputs[0]
+        return embeddings
+
+    def get_fused_embeddings(
+        self,
+        image_texts_pairs: DataLoader[BatchedInput] | None = None,
+        batch_size: int = 32,
+        show_progress_bar: bool = True,
+        fusion_mode="concat",
+        **kwargs: Any,
+    ):
+        import torchvision.transforms.functional as F
+
+        contains_image = "image" in image_texts_pairs.dataset.features
+        contains_text = "text" in image_texts_pairs.dataset.features
+        contains_both = contains_image and contains_text
+
+        if contains_both:
+            progress_desc = "Encoding images+texts"
+        elif contains_image:
+            progress_desc = "Encoding images"
+        elif contains_text:
+            progress_desc = "Encoding texts"
+        else:
+            raise ValueError("No text or image features found in inputs.")
+
+        all_embeds: list[torch.Tensor] = []
+        with torch.no_grad():
+            for batch in tqdm(
+                image_texts_pairs,
+                disable=not show_progress_bar,
+                desc=progress_desc,
+            ):
+                if contains_image:
+                    imgs = [
+                        F.to_pil_image(b.to(self.device))
+                        if not isinstance(b, Image.Image)
+                        else b
+                        for b in batch["image"]
+                    ]
+                else:
+                    imgs = None
+                if contains_text:
+                    texts = batch["text"]
+                else:
+                    texts = None
+                if contains_both:
+                    assert len(imgs) == len(texts), (
+                        f"The number of texts and images must have the same length, got {len(imgs)} and {len(texts)}"
+                    )
+
+                inputs = self.processor(images=imgs, text=texts)
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                outs = self._encode_inputs(inputs)
+                all_embeds.extend(outs.cpu().to(torch.float32))
+
+        padded = torch.nn.utils.rnn.pad_sequence(
+            all_embeds, batch_first=True, padding_value=0
+        )
+        return padded
+
+    def similarity(self, a, b):
+        return self.processor.score_multi_vector(a, b, device=self.device)
+
+
 colqwen2 = ModelMeta(
     loader=ColQwen2Wrapper,
     loader_kwargs=dict(
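ColQwen3Wrapper returns one embedding per token, padded to a common length with pad_sequence, rather than a single pooled vector, and its similarity method delegates to the processor's score_multi_vector, matching the MAX_SIM scoring function declared in the ModelMeta entries added below. For orientation, the ColBERT-style late-interaction score this roughly corresponds to can be sketched as follows (a minimal illustration, not the colpali_engine implementation):

import torch

def maxsim_scores(queries: torch.Tensor, docs: torch.Tensor) -> torch.Tensor:
    # queries: (Q, Tq, D), docs: (N, Td, D) -> (Q, N) late-interaction scores.
    sim = torch.einsum("qtd,nsd->qnts", queries, docs)  # token-level dot products
    # best-matching document token per query token, summed over query tokens
    return sim.max(dim=-1).values.sum(dim=-1)

q, d = torch.randn(2, 5, 128), torch.randn(3, 9, 128)  # dummy padded multi-vector embeddings
print(maxsim_scores(q, d).shape)  # torch.Size([2, 3])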
@@ -125,6 +259,72 @@ colqwen2_5 = ModelMeta(
     citation=COLPALI_CITATION,
 )

+TOMORO_TRAINING_DATA = {
+    "VDRMultilingualRetrieval",
+    # from https://huggingface.co/datasets/vidore/colpali_train_set
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
+    "VisRAG-Ret-Train-Synthetic-data",
+    "VisRAG-Ret-Train-In-domain-data",
+}
+
+TOMORO_CITATION = """
+@misc{huang2025tomoro_colqwen3_embed,
+  title={TomoroAI/tomoro-colqwen3-embed},
+  author={Xin Huang and Kye Min Tan and Albert Phelps},
+  year={2025},
+  url={https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b}
+}
+"""
+
+colqwen3_8b = ModelMeta(
+    loader=ColQwen3Wrapper,
+    name="TomoroAI/tomoro-colqwen3-embed-8b",
+    languages=["eng-Latn"],
+    revision="0b9fe28142910e209bbac15b1efe85507c27644f",
+    release_date="2025-11-26",
+    modalities=["image", "text"],
+    n_parameters=8_000_000_000,
+    memory_usage_mb=16724,
+    max_tokens=262144,
+    embed_dim=320,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-8b",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    use_instructions=True,
+    training_datasets=TOMORO_TRAINING_DATA,
+    citation=TOMORO_CITATION,
+)
+
+colqwen3_4b = ModelMeta(
+    loader=ColQwen3Wrapper,
+    name="TomoroAI/tomoro-colqwen3-embed-4b",
+    languages=["eng-Latn"],
+    revision="6a32fb68598730bf5620fbf18d832c784235c59c",
+    release_date="2025-11-26",
+    modalities=["image", "text"],
+    n_parameters=4_000_000_000,
+    memory_usage_mb=8466,
+    max_tokens=262144,
+    embed_dim=320,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code="https://github.com/illuin-tech/colpali",
+    public_training_data=None,
+    framework=["PyTorch"],
+    reference="https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b",
+    similarity_fn_name=ScoringFunction.MAX_SIM,
+    use_instructions=True,
+    training_datasets=TOMORO_TRAINING_DATA,
+    citation=TOMORO_CITATION,
+)
+
 colnomic_7b = ModelMeta(
     loader=ColQwen2_5Wrapper,
     loader_kwargs=dict(
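Once registered, these checkpoints should be reachable through mteb's normal model registry. A hedged usage sketch, assuming the v2 mteb.get_model / mteb.get_tasks / mteb.evaluate entry points and using VidoreArxivQARetrieval purely as an illustrative task name:

import mteb

model = mteb.get_model("TomoroAI/tomoro-colqwen3-embed-4b")  # or the -8b variant
tasks = mteb.get_tasks(tasks=["VidoreArxivQARetrieval"])
results = mteb.evaluate(model, tasks=tasks)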
@@ -223,7 +423,11 @@ colnomic_7b = ModelMeta(


 EVOQWEN_TRAINING_DATA = {
-    "colpali_train_set",
+    # "colpali_train_set",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
     "VisRAG-Ret-Train-Synthetic-data",
     "VisRAG-Ret-Train-In-domain-data",
 }

mteb/models/model_implementations/eagerworks_models.py
@@ -0,0 +1,163 @@
+from typing import Any
+
+import torch
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+from mteb._requires_package import (
+    requires_image_dependencies,
+    requires_package,
+)
+from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+from mteb.types import Array, BatchedInput, PromptType
+
+
+class EagerEmbedV1Wrapper(AbsEncoder):
+    """Wrapper for EagerEmbed single-vector embedding models."""
+
+    def __init__(
+        self,
+        model_name: str,
+        revision: str | None = None,
+        device: str | None = None,
+        image_size: int = 784,
+        **kwargs,
+    ):
+        requires_image_dependencies()
+        requires_package(
+            self, "qwen_vl_utils", model_name, "pip install mteb[eager_embed]"
+        )
+        from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.image_size = image_size
+
+        # Load model
+        self.mdl = Qwen3VLForConditionalGeneration.from_pretrained(model_name, **kwargs)
+        self.mdl = self.mdl.to(self.device)
+        self.mdl.eval()
+
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(model_name)
+
+    def get_embedding(self, last_hidden_state: torch.Tensor) -> torch.Tensor:
+        """Extract embeddings from last token of last hidden state."""
+        reps = last_hidden_state[:, -1]
+        return reps
+
+    def encode(
+        self,
+        inputs: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        """Encode inputs (text and/or images) into embeddings."""
+        from qwen_vl_utils import process_vision_info
+
+        all_embeddings: list[torch.Tensor] = []
+
+        with torch.no_grad():
+            for batch in tqdm(inputs, desc="Encoding"):
+                batch_texts = batch.get("text", [])
+                batch_images = batch.get("image", [])
+
+                messages = []
+                for i in range(max(len(batch_texts), len(batch_images))):
+                    text_content = batch_texts[i] if batch_texts else ""
+                    image_content = batch_images[i] if batch_images else None
+
+                    query_prefix = "Query: " if prompt_type == PromptType.query else ""
+                    content = [
+                        {"type": "text", "text": f"{query_prefix}{text_content}"}
+                    ]
+
+                    if image_content is not None:
+                        content.append(
+                            {
+                                "type": "image",
+                                "image": image_content,
+                                "resized_height": self.image_size,
+                                "resized_width": self.image_size,
+                            }
+                        )
+
+                    messages.append([{"role": "user", "content": content}])
+
+                # Prepare inputs
+                texts = [
+                    self.processor.apply_chat_template(
+                        msg, tokenize=False, add_generation_prompt=False
+                    )
+                    + "<|endoftext|>"
+                    for msg in messages
+                ]
+
+                image_inputs = None
+                video_inputs = None
+                if batch_images:
+                    image_inputs, video_inputs = process_vision_info(messages)
+
+                model_inputs = self.processor(
+                    text=texts,
+                    images=image_inputs,
+                    videos=video_inputs,
+                    padding="longest",
+                    return_tensors="pt",
+                ).to(self.device)
+
+                # Get embeddings
+                output = self.mdl(
+                    **model_inputs, return_dict=True, output_hidden_states=True
+                )
+                embeddings = self.get_embedding(output.hidden_states[-1])
+                embeddings = embeddings.cpu().to(torch.float32)
+                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+
+                all_embeddings.append(embeddings)
+
+        return torch.cat(all_embeddings, dim=0)
+
+
+EAGER_EMBED_V1_CITATION = """@article{EagerEmbed,
+  title={Eager Embed V1: Multimodal Dense Embeddings for Retrieval},
+  author={Juan Pablo Balarini},
+  year={2025},
+  publisher={Eagerworks},
+  url={https://github.com/eagerworks/eager-embed},
+}"""
+
+EAGER_EMBED_V1_TRAINING_DATASETS = {"colpali", "bge-ir", "pixmo-docs", "wiki-ss"}
+
+Eager_Embed_V1 = ModelMeta(
+    loader=EagerEmbedV1Wrapper,
+    loader_kwargs=dict(
+        dtype=torch.float16,
+        image_size=784,
+    ),
+    name="eagerworks/eager-embed-v1",
+    languages=["fra-Latn", "spa-Latn", "eng-Latn", "deu-Latn"],
+    revision="a6bec272729c5056e2c26618ce085205c82a3b3c",
+    release_date="2025-11-20",
+    modalities=["image", "text"],
+    n_parameters=4_000_000_000,
+    memory_usage_mb=16929,
+    max_tokens=262144,
+    embed_dim=2560,
+    license="apache-2.0",
+    open_weights=True,
+    framework=["Tevatron"],
+    reference="https://huggingface.co/eagerworks/eager-embed-v1",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    training_datasets=EAGER_EMBED_V1_TRAINING_DATASETS,
+    citation=EAGER_EMBED_V1_CITATION,
+    adapted_from="https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct",
+    public_training_code="https://github.com/eagerworks/eager-embed",
+    public_training_data="https://github.com/eagerworks/eager-embed/blob/main/dataset_config.yaml",
+)
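EagerEmbedV1Wrapper pools one vector per input by taking the hidden state at the final token position and L2-normalizing it, consistent with the COSINE similarity declared in the ModelMeta. A minimal sketch of that pooling step on a dummy batch (illustrative only; the hidden size matches the embed_dim noted above):

import torch

def last_token_pool(last_hidden_state: torch.Tensor) -> torch.Tensor:
    # (batch, seq_len, hidden) -> (batch, hidden): hidden state at the last position, unit-normalized
    reps = last_hidden_state[:, -1]
    return torch.nn.functional.normalize(reps, p=2, dim=-1)

hidden = torch.randn(4, 12, 2560)  # dummy hidden states
print(last_token_pool(hidden).shape)  # torch.Size([4, 2560])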

mteb/models/model_implementations/euler_models.py
@@ -0,0 +1,25 @@
+from mteb.models.model_meta import ModelMeta
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+
+Euler_Legal_Embedding_V1 = ModelMeta(
+    loader=sentence_transformers_loader,
+    name="Mira190/Euler-Legal-Embedding-V1",
+    revision="df607ed9e25e569514a99c27cdaaab16e76b6dd4",
+    release_date="2025-11-06",
+    languages=["eng-Latn"],
+    n_parameters=8000000000,
+    memory_usage_mb=15618,
+    max_tokens=1536,
+    embed_dim=4096,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/Mira190/Euler-Legal-Embedding-V1",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    training_datasets=set(),  # final-data-new-anonymized-grok4-filtered
+    adapted_from="Qwen/Qwen3-Embedding-8B",
+    superseded_by=None,
+)

mteb/models/model_implementations/google_models.py
@@ -275,5 +275,5 @@ embedding_gemma_300m = ModelMeta(
     public_training_data=None,
     training_datasets=GECKO_TRAINING_DATA,
     similarity_fn_name="cosine",
-    memory_usage_mb=578,
+    memory_usage_mb=1155,
 )

mteb/models/model_implementations/jina_models.py
@@ -1,8 +1,10 @@
 import logging
+from collections import defaultdict
 from typing import Any, ClassVar

 import numpy as np
 import torch
+from sentence_transformers import CrossEncoder
 from torch.utils.data import DataLoader

 from mteb._requires_package import requires_package
@@ -10,13 +12,92 @@ from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.languages import PROGRAMMING_LANGS
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
+from mteb.models.sentence_transformer_wrapper import (
+    CrossEncoderWrapper,
+    SentenceTransformerEncoderWrapper,
+)
 from mteb.types import Array, BatchedInput, PromptType

 logger = logging.getLogger(__name__)

 MIN_SENTENCE_TRANSFORMERS_VERSION = (3, 1, 0)

+multilingual_langs = [
+    "afr-Latn",
+    "ara-Arab",
+    "aze-Latn",
+    "bel-Cyrl",
+    "bul-Cyrl",
+    "ben-Beng",
+    "cat-Latn",
+    "ceb-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Grek",
+    "eng-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Arab",
+    "fin-Latn",
+    "fra-Latn",
+    "glg-Latn",
+    "guj-Gujr",
+    "heb-Hebr",
+    "hin-Deva",
+    "hrv-Latn",
+    "hat-Latn",
+    "hun-Latn",
+    "hye-Armn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Jpan",
+    "jav-Latn",
+    "kat-Geor",
+    "kaz-Cyrl",
+    "khm-Khmr",
+    "kan-Knda",
+    "kor-Hang",
+    "kir-Cyrl",
+    "lao-Laoo",
+    "lit-Latn",
+    "lav-Latn",
+    "mkd-Cyrl",
+    "mal-Mlym",
+    "mon-Cyrl",
+    "mar-Deva",
+    "msa-Latn",
+    "mya-Mymr",
+    "nep-Deva",
+    "nld-Latn",
+    "nor-Latn",
+    "nob-Latn",
+    "nno-Latn",
+    "pan-Guru",
+    "pol-Latn",
+    "por-Latn",
+    "que-Latn",
+    "ron-Latn",
+    "rus-Cyrl",
+    "sin-Sinh",
+    "slk-Latn",
+    "slv-Latn",
+    "swa-Latn",
+    "tam-Taml",
+    "tel-Telu",
+    "tha-Thai",
+    "tgl-Latn",
+    "tur-Latn",
+    "ukr-Cyrl",
+    "urd-Arab",
+    "vie-Latn",
+    "yor-Latn",
+    "zho-Hans",
+]
+
 XLMR_LANGUAGES = [
     "afr-Latn",
     "amh-Latn",
@@ -119,6 +200,28 @@ XLMR_LANGUAGES = [
     "zho-Hans",
 ]

+JINARerankerV3_TRAINING_DATA = {
+    "MIRACLRetrieval",
+    "MIRACLRetrievalHardNegatives",
+    "MIRACLReranking",
+    "CMedQAv1-reranking",
+    "CMedQAv2-reranking",
+    "MrTidyRetrieval",
+    "T2Reranking",
+    "MSMARCO",
+    "MSMARCOHardNegatives",
+    "NQ",
+    "NQHardNegatives",
+    "HotpotQA",
+    "HotpotQAHardNegatives",
+    "T2Retrieval",
+    "DuRetrieval",
+    "MMarcoReranking",
+    "CornStack",
+    "MultiLongDocRetrieval",
+    "StackOverflowQA",
+}
+
 JinaV4_TRAINING_DATA = {
     "MSMARCO",
     "MSMARCOHardNegatives",
@@ -139,14 +242,72 @@ JinaV4_TRAINING_DATA = {
     "CornStack",
     "VDRMultilingualRetrieval",
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
     # "other", # inhouse dataset including synthetic datasets
 }


+class JinaRerankerV3Wrapper(CrossEncoderWrapper):
+    """Wrapper integration for MTEB."""
+
+    def __init__(
+        self,
+        model: CrossEncoder | str,
+        revision: str | None = None,
+        trust_remote_code: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        from sentence_transformers.util import get_device_name
+        from transformers import AutoModel
+
+        self.model = AutoModel.from_pretrained(
+            model, trust_remote_code=trust_remote_code, dtype="auto"
+        )
+
+        device = kwargs.get("device", None)
+        if device is None:
+            device = get_device_name()
+            logger.info(f"Use pytorch device: {device}")
+
+        self.model.to(device)
+        self.model.eval()
+
+    def predict(
+        self,
+        inputs1: DataLoader[BatchedInput],
+        inputs2: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        all_corpus = [text for batch in inputs2 for text in batch["text"]]
+        all_queries = [text for batch in inputs1 for text in batch["text"]]
+
+        sentences_count = len(all_corpus)
+        query_groups: dict[str, list[tuple[int, str]]] = defaultdict(list)
+        for idx, (query, doc) in enumerate(zip(all_queries, all_corpus)):
+            query_groups[query].append((idx, doc))
+
+        results = np.zeros(sentences_count, dtype=np.float32)
+        for query, doc_infos in query_groups.items():
+            original_indices, docs = zip(*doc_infos)
+
+            scores = self.model.rerank(
+                query, list(docs), max_query_length=3072, max_doc_length=2048
+            )
+            for scr in scores:
+                original_idx = original_indices[scr["index"]]
+                results[original_idx] = float(scr["relevance_score"])
+
+        return results
+
+
 class JinaWrapper(SentenceTransformerEncoderWrapper):
     """following the hf model card documentation."""

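The predict method above flattens the two dataloaders into parallel query/document lists, groups documents by query so each unique query is sent to rerank() only once, and then scatters the returned relevance scores back into the original pair order. The same group-and-scatter pattern in isolation, with a stand-in scorer instead of the model call (a sketch, not the wrapper itself):

from collections import defaultdict

import numpy as np

def score_pairs(queries: list[str], docs: list[str]) -> np.ndarray:
    groups: dict[str, list[tuple[int, str]]] = defaultdict(list)
    for idx, (query, doc) in enumerate(zip(queries, docs)):
        groups[query].append((idx, doc))

    results = np.zeros(len(docs), dtype=np.float32)
    for query, doc_infos in groups.items():
        original_indices, grouped_docs = zip(*doc_infos)
        # stand-in for self.model.rerank(query, list(grouped_docs), ...): score by length
        scores = [{"index": i, "relevance_score": float(len(d))} for i, d in enumerate(grouped_docs)]
        for scr in scores:
            results[original_indices[scr["index"]]] = scr["relevance_score"]
    return results

print(score_pairs(["q1", "q1", "q2"], ["short doc", "a much longer document", "another doc"]))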
@@ -553,6 +714,43 @@ def get_programming_task_override(
     return current_task_name


+jina_reranker_v3 = ModelMeta(
+    loader=JinaRerankerV3Wrapper,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+    ),
+    name="jinaai/jina-reranker-v3",
+    languages=multilingual_langs,
+    open_weights=True,
+    revision="050e171c4f75dfec5b648ed8470a2475e5a30f30",
+    release_date="2025-09-18",  # official release date
+    modalities=["text"],
+    n_parameters=int(0.6 * 1e9),
+    memory_usage_mb=1138,
+    max_tokens=131072,
+    embed_dim=None,
+    license="cc-by-nc-4.0",
+    similarity_fn_name=None,
+    framework=["PyTorch"],
+    use_instructions=None,
+    reference="https://huggingface.co/jinaai/jina-reranker-v3",
+    is_cross_encoder=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=JINARerankerV3_TRAINING_DATA,
+    adapted_from="Qwen/Qwen3-0.6B",
+    citation="""@misc{wang2025jinarerankerv3lateinteractionlistwise,
+      title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
+      author={Feng Wang and Yuqing Li and Han Xiao},
+      year={2025},
+      eprint={2509.25085},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2509.25085},}
+    """,
+)
+
+
 jina_embeddings_v4 = ModelMeta(
     loader=JinaV4Wrapper,
     loader_kwargs=dict(

mteb/models/model_implementations/nb_sbert.py
@@ -11,7 +11,7 @@ nb_sbert = ModelMeta(
     revision="b95656350a076aeafd2d23763660f80655408cc6",
     release_date="2022-11-23",
     n_parameters=1_780_000_000,
-    memory_usage_mb=197,
+    memory_usage_mb=678,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=75,