PyPI - mteb - Versions diffs - 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl - Mend

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (527) hide show

mteb/models/model_implementations/jasper_models.py CHANGED Viewed

@@ -7,13 +7,225 @@ from torch.utils.data import DataLoader
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_implementations.bge_models import (
+    bge_chinese_training_data,
+    bge_full_data,
+    bge_m3_training_data,
+)
+from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
+from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
+from mteb.models.model_implementations.qzhou_models import qzhou_training_data
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
-from .nvidia_models import nvidia_training_datasets
 logger = logging.getLogger(__name__)
+jasper_token_compression_600m_prompts_dict = {
+    "AFQMC": "Retrieve semantically similar text",
+    "AILACasedocs": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "AILAStatutes": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "ATEC": "Retrieve semantically similar text",
+    "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual",
+    "ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts",
+    "ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles",
+    "ArguAna": {
+        "query": "Given a claim, find documents that refute the claim",
+        "document": "Given a claim, find documents that refute the claim",
+    },
+    "AskUbuntuDupQuestions": {
+        "query": "Retrieve duplicate questions from AskUbuntu forum",
+        "document": "",
+    },
+    "BIOSSES": "Retrieve semantically similar text",
+    "BQ": "Retrieve semantically similar text",
+    "Banking77Classification": "Given a online banking query, find the corresponding intents",
+    "BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts",
+    "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts",
+    "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles",
+    "CMedQAv1-reranking": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "CMedQAv2-reranking": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "CQADupstackGamingRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+    },
+    "CQADupstackUnixRetrieval": {
+        "query": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+        "document": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question",
+    },
+    "ClimateFEVERHardNegatives": {
+        "query": "Given a claim about climate change, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "CmedqaRetrieval": {
+        "query": "Given a Chinese community medical question, retrieve replies that best answer the question",
+        "document": "",
+    },
+    "Cmnli": "Retrieve semantically similar text.",
+    "CovidRetrieval": {
+        "query": "Given a question on COVID-19, retrieve news articles that answer the question",
+        "document": "",
+    },
+    "DuRetrieval": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "EcomRetrieval": {
+        "query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products",
+        "document": "",
+    },
+    "FEVERHardNegatives": {
+        "query": "Given a claim, retrieve documents that support or refute the claim",
+        "document": "",
+    },
+    "FiQA2018": {
+        "query": "Given a financial question, retrieve user replies that best answer the question",
+        "document": "",
+    },
+    "GerDaLIRSmall": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "HotpotQAHardNegatives": {
+        "query": "Given a multi-hop question, retrieve documents that can help answer the question",
+        "document": "",
+    },
+    "IFlyTek": "Given an App description text, find the appropriate fine-grained category",
+    "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset",
+    "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative",
+    "LCQMC": "Retrieve semantically similar text",
+    "LeCaRDv2": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalBenchConsumerContractsQA": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalBenchCorporateLobbying": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalQuAD": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "LegalSummarization": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "MMarcoReranking": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "MMarcoRetrieval": {
+        "query": "Given a web search query, retrieve relevant passages that answer the query",
+        "document": "",
+    },
+    "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation",
+    "MassiveIntentClassification": "Given a user utterance as query, find the user intents",
+    "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios",
+    "MedicalRetrieval": {
+        "query": "Given a medical question, retrieve user replies that best answer the question",
+        "document": "",
+    },
+    "MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstracts",
+    "MedrxivClusteringS2S.v2": "Identify the main category of Medrxiv papers based on the titles",
+    "MindSmallReranking": {
+        "query": "Retrieve relevant news articles based on user browsing history",
+        "document": "",
+    },
+    "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative",
+    "Ocnli": "Retrieve semantically similar text.",
+    "OnlineShopping": "Classify the customer review for online shopping into positive or negative",
+    "PAWSX": "Retrieve semantically similar text",
+    "QBQTC": "Retrieve semantically similar text",
+    "SCIDOCS": {
+        "query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper",
+        "document": "",
+    },
+    "SICK-R": "Retrieve semantically similar text",
+    "STS12": "Retrieve semantically similar text",
+    "STS13": "Retrieve semantically similar text",
+    "STS14": "Retrieve semantically similar text",
+    "STS15": "Retrieve semantically similar text",
+    "STS17": "Retrieve semantically similar text",
+    "STS22.v2": "Retrieve semantically similar text",
+    "STSB": "Retrieve semantically similar text",
+    "STSBenchmark": "Retrieve semantically similar text",
+    "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum",
+    "StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles",
+    "StackExchangeClusteringP2P.v2": "Identify the topic or theme of StackExchange posts based on the given paragraphs",
+    "SummEvalSummarization.v2": "Retrieve semantically similar text",
+    "T2Reranking": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "T2Retrieval": {
+        "query": "Given a Chinese search query, retrieve web passages that answer the question",
+        "document": "",
+    },
+    "TNews": "Classify the fine-grained category of the given news title",
+    "TRECCOVID": {
+        "query": "Given a query on COVID-19, retrieve documents that answer the query",
+        "document": "",
+    },
+    "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents",
+    "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
+    "Touche2020Retrieval.v3": {
+        "query": "Given a question, retrieve detailed and persuasive arguments that answer the question",
+        "document": "",
+    },
+    "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic",
+    "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral",
+    "TwentyNewsgroupsClustering.v2": "Identify the topic or theme of the given news articles",
+    "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet",
+    "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet",
+    "VideoRetrieval": {
+        "query": "Given a video search query, retrieve the titles of relevant videos",
+        "document": "",
+    },
+    "Waimai": "Classify the customer review from a food takeaway platform into positive or negative",
+}
+jasper_token_compression_600m_loader_kwargs = dict(
+    model_kwargs={
+        "attn_implementation": "sdpa",
+        "torch_dtype": "bfloat16",
+        "trust_remote_code": True,
+    },
+    tokenizer_kwargs={"padding_side": "left"},
+    trust_remote_code=True,
+    prompts_dict=jasper_token_compression_600m_prompts_dict,
+    apply_instruction_to_passages=True,
+    instruction_template="Instruct: {instruction}\nQuery: ",
+    max_seq_length=1024,
+)
+def instruction_template(
+    instruction: str, prompt_type: PromptType | None = None
+) -> str:
+    if not instruction or prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        if prompt_type is None:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+        else:
+            instruction = instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
 class JasperModel(AbsEncoder):
     def __init__(
@@ -74,6 +286,7 @@ jasper_en_v1 = ModelMeta(
         instruction_template="Instruct: {instruction}\nQuery: ",
     ),
     name="NovaSearch/jasper_en_vision_language_v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     open_weights=True,
     revision="d6330ce98f8a0d741e781df845904c9484f00efa",
@@ -85,7 +298,7 @@ jasper_en_v1 = ModelMeta(
     license="apache-2.0",
     reference="https://huggingface.co/infgrad/jasper_en_vision_language_v1",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     use_instructions=True,
     adapted_from=None,
     superseded_by=None,
@@ -114,3 +327,43 @@ jasper_en_v1 = ModelMeta(
 }
 """,
 )
+Jasper_Token_Compression_600M = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=jasper_token_compression_600m_loader_kwargs,
+    name="infgrad/Jasper-Token-Compression-600M",
+    model_type=["dense"],
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
+    release_date="2025-11-14",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
+    use_instructions=True,
+    public_training_code="https://github.com/DunZhang/Jasper-Token-Compression-Training",
+    # public_training_data: unsupervised data for distillation
+    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
+    training_datasets=bge_m3_training_data
+    | bge_chinese_training_data
+    | bge_full_data
+    | E5_MISTRAL_TRAINING_DATA
+    | qzhou_training_data,
+    citation="""
+@misc{zhang2025jaspertokencompression600mtechnicalreport,
+      title={Jasper-Token-Compression-600M Technical Report},
+      author={Dun Zhang and Ziyang Zeng and Yudong Zhou and Shuyang Lu},
+      year={2025},
+      eprint={2511.14405},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2511.14405},
+}
+""",
+)

mteb/models/model_implementations/jina_clip.py CHANGED Viewed

@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
 from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
@@ -120,9 +121,19 @@ class JinaCLIPModel(AbsEncoder):
         raise ValueError
+_JINA_CLIP_TRAIN_DATASETS_V1 = {
+    # LAION400M
+    # ShareGPT4V
+    "MSMARCO",
+    "NQ",
+    "HotpotQA",
+    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+}
 jina_clip_v1 = ModelMeta(
-    loader=JinaCLIPModel,  # type: ignore
+    loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="06150c7c382d7a4faedc7d5a0d8cdb59308968f4",
     release_date="2024-05-30",
@@ -135,17 +146,45 @@ jina_clip_v1 = ModelMeta(
     open_weights=True,
     public_training_code=None,
     public_training_data=None,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
-    training_datasets={
-        # LAION400M
-        # ShareGPT4V
-        "MSMARCO",
-        # NQ
-        # HotpotQA
-        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
-    },
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
     citation=JINA_CLIP_CITATION,
+    superseded_by="jinaai/jina-clip-v2",
+)
+jina_clip_v2 = ModelMeta(
+    loader=JinaCLIPModel,
+    name="jinaai/jina-clip-v2",
+    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
+    release_date="2024-10-09",
+    languages=["eng-Latn"],
+    n_parameters=865278477,
+    memory_usage_mb=1650.0,
+    max_tokens=8192,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/jinaai/jina-clip-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
+    modalities=["text", "image"],
+    model_type=["dense"],
+    citation="""
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+      title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+      author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+      year={2024},
+      eprint={2412.08802},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2412.08802},
+}
+""",
 )

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl