PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl - Mend

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (458) hide show

mteb/models/model_implementations/jasper_models.py CHANGED Viewed

@@ -7,13 +7,225 @@ from torch.utils.data import DataLoader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_implementations.bge_models import (
+    bge_chinese_training_data,
+    bge_full_data,
+    bge_m3_training_data,
+)
+from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
+from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
+from mteb.models.model_implementations.qzhou_models import qzhou_training_data
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
-from .nvidia_models import nvidia_training_datasets
 logger = logging.getLogger(__name__)
+# Per-task instruction table for the Jasper-Token-Compression-600M loader; it is
+# handed over as `prompts_dict` in `jasper_token_compression_600m_loader_kwargs`.
+# Keys are task names. A value is either one instruction string for the whole
+# task, or a dict with separate "query" and "document" instructions.
+# NOTE(review): an empty "document" string presumably means "no instruction for
+# documents" — confirm against the loader's handling of empty prompts.
+# The strings are runtime prompts; keep them byte-for-byte as published.
+jasper_token_compression_600m_prompts_dict = {
+    "AFQMC": "Retrieve semantically similar text",
+    "AILACasedocs": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "AILAStatutes": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "ATEC": "Retrieve semantically similar text",
+    "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual",
+    "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts",
+    "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles",
+    "ArguAna": {
+        "query": "Given a claim, find documents that refute the claim",
+        "document": "Given a claim, find documents that refute the claim",
+    },
+    "AskUbuntuDupQuestions": {
+        "query": "Retrieve duplicate questions from AskUbuntu forum",
+        "document": "",
+    },
+    "BIOSSES": "Retrieve semantically similar text",
+    "BQ": "Retrieve semantically similar text",
+    "Banking77Classification": "Given a online banking query, find the corresponding intents",
+    "BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts",
+    "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts",
+    "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles",
+    "CMedQAv1-reranking": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "CMedQAv2-reranking": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "CQADupstackGamingRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+    },
+    "CQADupstackUnixRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+    },
+    "ClimateFEVERHardNegatives": {
+        "query": "Given a claim about climate change, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "CmedqaRetrieval": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "Cmnli": "Retrieve semantically similar text.",
+    "CovidRetrieval": {
+        "query": "Given a question on COVID-19, retrieve news articles that answer the question",
+        "document": "",
+    },
+    "DuRetrieval": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "EcomRetrieval": {
+        "query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products",
+        "document": "",
+    },
+    "FEVERHardNegatives": {
+        "query": "Given a claim, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "FiQA2018": {
+        "query": "Given a financial question, retrieve user replies that best answer the question",
+        "document": "",
+    },
+    "GerDaLIRSmall": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "HotpotQAHardNegatives": {
+        "query": "Given a multi-hop question, retrieve documents that can help answer the question",
+        "document": "",
+    },
+    "IFlyTek": "Given an App description text, find the appropriate fine-grained category",
+    "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset",
+    "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative",
+    "LCQMC": "Retrieve semantically similar text",
+    "LeCaRDv2": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalBenchConsumerContractsQA": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalBenchCorporateLobbying": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalQuAD": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalSummarization": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "MMarcoReranking": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "MMarcoRetrieval": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation",
+    "MassiveIntentClassification": "Given a user utterance as query, find the user intents",
+    "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios",
+    "MedicalRetrieval": {
+        "query": "Given a medical question, retrieve user replies that best answer the question",
+        "document": "",
+    },
+    "MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstracts",
+    "MedrxivClusteringS2S.v2": "Identify the main category of Medrxiv papers based on the titles",
+    "MindSmallReranking": {
+        "query": "Retrieve relevant news articles based on user browsing history",
+        "document": "",
+    },
+    "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative",
+    "Ocnli": "Retrieve semantically similar text.",
+    "OnlineShopping": "Classify the customer review for online shopping into positive or negative",
+    "PAWSX": "Retrieve semantically similar text",
+    "QBQTC": "Retrieve semantically similar text",
+    "SCIDOCS": {
+        "query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper",
+        "document": "",
+    },
+    "SICK-R": "Retrieve semantically similar text",
+    "STS12": "Retrieve semantically similar text",
+    "STS13": "Retrieve semantically similar text",
+    "STS14": "Retrieve semantically similar text",
+    "STS15": "Retrieve semantically similar text",
+    "STS17": "Retrieve semantically similar text",
+    "STS22.v2": "Retrieve semantically similar text",
+    "STSB": "Retrieve semantically similar text",
+    "STSBenchmark": "Retrieve semantically similar text",
+    "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum",
+    "StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles",
+    "StackExchangeClusteringP2P.v2": "Identify the topic or theme of StackExchange posts based on the given paragraphs",
+    "SummEvalSummarization.v2": "Retrieve semantically similar text",
+    "T2Reranking": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "T2Retrieval": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "TNews": "Classify the fine-grained category of the given news title",
+    "TRECCOVID": {
+        "query": "Given a query on COVID-19, retrieve documents that answer the query",
+        "document": "",
+    },
+    "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
+    "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
+    "Touche2020Retrieval.v3": {
+        "query": "Given a question, retrieve detailed and persuasive arguments that answer the question",
+        "document": "",
+    },
+    "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic",
+    "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral",
+    "TwentyNewsgroupsClustering.v2": "Identify the topic or theme of the given news articles",
+    "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet",
+    "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet",
+    "VideoRetrieval": {
+        "query": "Given a video search query, retrieve the titles of relevant videos",
+        "document": "",
+    },
+    "Waimai": "Classify the customer review from a food takeaway platform into positive or negative",
+}
+# Keyword arguments passed as `loader_kwargs` to the
+# `Jasper_Token_Compression_600M` ModelMeta, whose loader is
+# `InstructSentenceTransformerModel`.
+# NOTE(review): `trust_remote_code` appears both inside `model_kwargs` and at the
+# top level — presumably one reaches the underlying model and the other the
+# wrapper; confirm both are required.
+# NOTE(review): `max_seq_length=1024` is far below the `max_tokens=32768`
+# declared on the ModelMeta — confirm this cap is intentional.
+jasper_token_compression_600m_loader_kwargs = dict(
+    model_kwargs={
+        "attn_implementation": "sdpa",
+        "torch_dtype": "bfloat16",
+        "trust_remote_code": True,
+    },
+    tokenizer_kwargs={"padding_side": "left"},
+    trust_remote_code=True,
+    # Task-name -> instruction table defined above in this module.
+    prompts_dict=jasper_token_compression_600m_prompts_dict,
+    apply_instruction_to_passages=True,
+    # Note the trailing space after "Query:".
+    instruction_template="Instruct: {instruction}\nQuery: ",
+    max_seq_length=1024,
+)
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    """Build the instruction prefix that is placed in front of an input text.
+
+    Args:
+        instruction: The instruction text. Although annotated as ``str``, a
+            dict indexed by prompt type is handled as well.
+        prompt_type: Which side of the pair is being encoded, or ``None``.
+
+    Returns:
+        An empty string when ``instruction`` is falsy or ``prompt_type`` is
+        ``PromptType.document``; otherwise ``Instruct: <instruction>`` followed
+        by a newline and ``Query:`` (no trailing space).
+    """
+    # Guard clauses: nothing to prepend for empty instructions or documents.
+    if not instruction:
+        return ""
+    if prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        # Per-prompt-type mapping: take the matching entry, or fall back to the
+        # generic web-search instruction when no prompt type was given.
+        generic = "Given a web search query, retrieve relevant passages that answer the query"
+        instruction = generic if prompt_type is None else instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
 class JasperModel(AbsEncoder):
     def __init__(
@@ -74,6 +286,7 @@ jasper_en_v1 = ModelMeta(
         instruction_template="Instruct: {instruction}\nQuery: ",
     ),
     name="NovaSearch/jasper_en_vision_language_v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="d6330ce98f8a0d741e781df845904c9484f00efa",
@@ -114,3 +327,43 @@ jasper_en_v1 = ModelMeta(
 }
 """,
 )
+# Registry entry (ModelMeta) for infgrad/Jasper-Token-Compression-600M.
+# The model is loaded through `InstructSentenceTransformerModel` with the
+# loader kwargs (prompts, instruction template, sequence cap) defined earlier
+# in this module.
+Jasper_Token_Compression_600M = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=jasper_token_compression_600m_loader_kwargs,
+    name="infgrad/Jasper-Token-Compression-600M",
+    model_type=["dense"],
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
+    release_date="2025-11-14",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code="https://github.com/DunZhang/Jasper-Token-Compression-Training",
+    # public_training_data: unsupervised data for distillation
+    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
+    # Combined with `|` from the training-data collections imported from the
+    # bge, e5-instruct and qzhou model modules.
+    training_datasets=bge_m3_training_data
+    | bge_chinese_training_data
+    | bge_full_data
+    | E5_MISTRAL_TRAINING_DATA
+    | qzhou_training_data,
+    citation="""
+@misc{zhang2025jaspertokencompression600mtechnicalreport,
+      title={Jasper-Token-Compression-600M Technical Report},
+      author={Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
+      year={2025},
+      eprint={2511.14405},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2511.14405},
+}
+""",
+)

mteb/models/model_implementations/jina_clip.py CHANGED Viewed

@@ -123,6 +123,7 @@ class JinaCLIPModel(AbsEncoder):
 jina_clip_v1 = ModelMeta(
     loader=JinaCLIPModel,  # type: ignore
     name="jinaai/jina-clip-v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4",
     release_date="2024-05-30",

mteb/models/model_implementations/jina_models.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import logging
+from collections import defaultdict
 from typing import Any, ClassVar
 import numpy as np
 import torch
+from sentence_transformers import CrossEncoder
 from torch.utils.data import DataLoader
 from mteb._requires_package import requires_package
@@ -10,13 +12,92 @@ from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.languages import PROGRAMMING_LANGS
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
+from mteb.models.sentence_transformer_wrapper import (
+    CrossEncoderWrapper,
+    SentenceTransformerEncoderWrapper,
+)
 from mteb.types import Array, BatchedInput, PromptType
 logger = logging.getLogger(__name__)
 MIN_SENTENCE_TRANSFORMERS_VERSION = (3, 1, 0)
+# Language list used as `languages` for the `jina_reranker_v3` ModelMeta below.
+# Each entry is a three-letter language code joined by "-" to a four-letter
+# script code (e.g. "eng-Latn").
+multilingual_langs = [
+    "afr-Latn",
+    "ara-Arab",
+    "aze-Latn",
+    "bel-Cyrl",
+    "bul-Cyrl",
+    "ben-Beng",
+    "cat-Latn",
+    "ceb-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Grek",
+    "eng-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Arab",
+    "fin-Latn",
+    "fra-Latn",
+    "glg-Latn",
+    "guj-Gujr",
+    "heb-Hebr",
+    "hin-Deva",
+    "hrv-Latn",
+    "hat-Latn",
+    "hun-Latn",
+    "hye-Armn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Jpan",
+    "jav-Latn",
+    "kat-Geor",
+    "kaz-Cyrl",
+    "khm-Khmr",
+    "kan-Knda",
+    "kor-Hang",
+    "kir-Cyrl",
+    "lao-Laoo",
+    "lit-Latn",
+    "lav-Latn",
+    "mkd-Cyrl",
+    "mal-Mlym",
+    "mon-Cyrl",
+    "mar-Deva",
+    "msa-Latn",
+    "mya-Mymr",
+    "nep-Deva",
+    "nld-Latn",
+    "nor-Latn",
+    "nob-Latn",
+    "nno-Latn",
+    "pan-Guru",
+    "pol-Latn",
+    "por-Latn",
+    "que-Latn",
+    "ron-Latn",
+    "rus-Cyrl",
+    "sin-Sinh",
+    "slk-Latn",
+    "slv-Latn",
+    "swa-Latn",
+    "tam-Taml",
+    "tel-Telu",
+    "tha-Thai",
+    "tgl-Latn",
+    "tur-Latn",
+    "ukr-Cyrl",
+    "urd-Arab",
+    "vie-Latn",
+    "yor-Latn",
+    "zho-Hans",
+]
 XLMR_LANGUAGES = [
     "afr-Latn",
     "amh-Latn",
@@ -119,6 +200,28 @@ XLMR_LANGUAGES = [
     "zho-Hans",
 ]
+# Set of dataset/task names reported as `training_datasets` on the
+# `jina_reranker_v3` ModelMeta below.
+JINARerankerV3_TRAINING_DATA = {
+    "MIRACLRetrieval",
+    "MIRACLRetrievalHardNegatives",
+    "MIRACLReranking",
+    "CMedQAv1-reranking",
+    "CMedQAv2-reranking",
+    "MrTidyRetrieval",
+    "T2Reranking",
+    "MSMARCO",
+    "MSMARCOHardNegatives",
+    "NQ",
+    "NQHardNegatives",
+    "HotpotQA",
+    "HotpotQAHardNegatives",
+    "T2Retrieval",
+    "DuRetrieval",
+    "MMarcoReranking",
+    "CornStack",
+    "MultiLongDocRetrieval",
+    "StackOverflowQA",
+}
 JinaV4_TRAINING_DATA = {
     "MSMARCO",
     "MSMARCOHardNegatives",
@@ -139,14 +242,72 @@ JinaV4_TRAINING_DATA = {
     "CornStack",
     "VDRMultilingualRetrieval",
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
     # "other", # inhouse dataset including synthetic datasets
 }
+class JinaRerankerV3Wrapper(CrossEncoderWrapper):
+    """MTEB cross-encoder wrapper for the remote-code jina-reranker-v3 model.
+
+    The checkpoint is loaded with ``transformers.AutoModel`` and scoring is
+    delegated to the ``rerank`` method exposed by the loaded model.
+    """
+    def __init__(
+        self,
+        model: CrossEncoder | str,
+        revision: str | None = None,
+        trust_remote_code: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        """Load the reranker and move it to the target device.
+
+        Args:
+            model: Model name or path handed to ``AutoModel.from_pretrained``.
+            revision: Hub revision (commit hash, branch or tag) to load. When
+                ``None`` the argument is omitted and ``transformers`` picks
+                its default.
+            trust_remote_code: Forwarded to ``AutoModel.from_pretrained``.
+            **kwargs: Only ``device`` is read; when it is absent or ``None``
+                the device comes from ``sentence_transformers``'
+                ``get_device_name``.
+        """
+        from sentence_transformers.util import get_device_name
+        from transformers import AutoModel
+        # NOTE(review): the parent __init__ is not called — presumably on
+        # purpose, because this model is not a sentence-transformers
+        # CrossEncoder; confirm no parent attribute is relied upon.
+        load_kwargs: dict[str, Any] = {
+            "trust_remote_code": trust_remote_code,
+            "dtype": "auto",
+        }
+        if revision is not None:
+            # Honour the pinned revision: with trust_remote_code enabled, an
+            # unpinned load would run whatever code the default branch holds.
+            load_kwargs["revision"] = revision
+        self.model = AutoModel.from_pretrained(model, **load_kwargs)
+        device = kwargs.get("device", None)
+        if device is None:
+            device = get_device_name()
+            logger.info(f"Use pytorch device: {device}")
+        self.model.to(device)
+        self.model.eval()
+    def predict(
+        self,
+        inputs1: DataLoader[BatchedInput],
+        inputs2: DataLoader[BatchedInput],
+        *,
+        task_metadata: TaskMetadata,
+        hf_split: str,
+        hf_subset: str,
+        prompt_type: PromptType | None = None,
+        **kwargs: Any,
+    ) -> Array:
+        """Score query/document pairs with the model's listwise ``rerank``.
+
+        Args:
+            inputs1: Batches whose ``"text"`` entries are the queries.
+            inputs2: Batches whose ``"text"`` entries are the documents.
+            task_metadata: Not used by this implementation.
+            hf_split: Not used by this implementation.
+            hf_subset: Not used by this implementation.
+            prompt_type: Not used by this implementation.
+            **kwargs: Not used by this implementation.
+
+        Returns:
+            A float32 array with one slot per document text; slot ``i`` holds
+            the relevance score of pair ``i``.
+        """
+        all_corpus = [text for batch in inputs2 for text in batch["text"]]
+        all_queries = [text for batch in inputs1 for text in batch["text"]]
+        sentences_count = len(all_corpus)
+        # Group documents by query text so each distinct query is reranked once
+        # against all of its documents; the original pair position is kept so
+        # scores can be written back in input order.
+        # NOTE(review): queries and documents are paired positionally with
+        # zip(); if the two streams differ in length the surplus is dropped and
+        # unmatched slots stay 0.0 — presumably callers always pass aligned
+        # pairs; confirm.
+        query_groups: dict[str, list[tuple[int, str]]] = defaultdict(list)
+        for idx, (query, doc) in enumerate(zip(all_queries, all_corpus)):
+            query_groups[query].append((idx, doc))
+        results = np.zeros(sentences_count, dtype=np.float32)
+        for query, doc_infos in query_groups.items():
+            original_indices, docs = zip(*doc_infos)
+            scores = self.model.rerank(
+                query, list(docs), max_query_length=3072, max_doc_length=2048
+            )
+            for scr in scores:
+                # scr["index"] is a position within `docs`; map it back to the
+                # pair's position in the flattened input.
+                original_idx = original_indices[scr["index"]]
+                results[original_idx] = float(scr["relevance_score"])
+        return results
 class JinaWrapper(SentenceTransformerEncoderWrapper):
     """following the hf model card documentation."""
@@ -553,6 +714,43 @@ def get_programming_task_override(
     return current_task_name
+# Registry entry (ModelMeta) for jinaai/jina-reranker-v3, a cross-encoder
+# reranker loaded through `JinaRerankerV3Wrapper`. The embedding-specific
+# fields (`embed_dim`, `similarity_fn_name`) are set to None.
+jina_reranker_v3 = ModelMeta(
+    loader=JinaRerankerV3Wrapper,
+    loader_kwargs=dict(
+        trust_remote_code=True,
+    ),
+    name="jinaai/jina-reranker-v3",
+    model_type=["cross-encoder"],
+    languages=multilingual_langs,
+    open_weights=True,
+    revision="050e171c4f75dfec5b648ed8470a2475e5a30f30",
+    release_date="2025-09-18",  # official release date
+    modalities=["text"],
+    # Rounded figure (0.6e9), not an exact parameter count.
+    n_parameters=int(0.6 * 1e9),
+    memory_usage_mb=1138,
+    max_tokens=131072,
+    embed_dim=None,
+    license="cc-by-nc-4.0",
+    similarity_fn_name=None,
+    framework=["PyTorch"],
+    use_instructions=None,
+    reference="https://huggingface.co/jinaai/jina-reranker-v3",
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=JINARerankerV3_TRAINING_DATA,
+    adapted_from="Qwen/Qwen3-0.6B",
+    citation="""@misc{wang2025jinarerankerv3lateinteractionlistwise,
+      title={jina-reranker-v3: Last but Not Late Interaction for Listwise Document Reranking},
+      author={Feng Wang and Yuqing Li and Han Xiao},
+      year={2025},
+      eprint={2509.25085},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2509.25085},}
+""",
+)
 jina_embeddings_v4 = ModelMeta(
     loader=JinaV4Wrapper,
     loader_kwargs=dict(
@@ -565,6 +763,7 @@ jina_embeddings_v4 = ModelMeta(
         },
     ),
     name="jinaai/jina-embeddings-v4",
+    model_type=["dense"],
     languages=XLMR_LANGUAGES,
     open_weights=True,
     revision="4a58ca57710c49f51896e4bc820e202fbf64904b",
@@ -613,6 +812,7 @@ jina_embeddings_v3 = ModelMeta(
         },
     ),
     name="jinaai/jina-embeddings-v3",
+    model_type=["dense"],
     languages=XLMR_LANGUAGES,
     open_weights=True,
     revision="215a6e121fa0183376388ac6b1ae230326bfeaed",
@@ -666,6 +866,7 @@ jina_embeddings_v2_base_en = ModelMeta(
         trust_remote_code=True,
     ),
     name="jinaai/jina-embeddings-v2-base-en",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="6e85f575bc273f1fd840a658067d0157933c83f0",
@@ -729,6 +930,7 @@ jina_embeddings_v2_small_en = ModelMeta(
         trust_remote_code=True,
     ),
     name="jinaai/jina-embeddings-v2-small-en",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="44e7d1d6caec8c883c2d4b207588504d519788d0",
@@ -789,6 +991,7 @@ jina_embeddings_v2_small_en = ModelMeta(
 jina_embedding_b_en_v1 = ModelMeta(
     loader=SentenceTransformerEncoderWrapper,
     name="jinaai/jina-embedding-b-en-v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="32aa658e5ceb90793454d22a57d8e3a14e699516",
@@ -845,6 +1048,7 @@ jina_embedding_b_en_v1 = ModelMeta(
 jina_embedding_s_en_v1 = ModelMeta(
     loader=SentenceTransformerEncoderWrapper,
     name="jinaai/jina-embedding-s-en-v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="5ac6cd473e2324c6d5f9e558a6a9f65abb57143e",

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl