mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py
@@ -7,12 +7,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class BrazilianToxicTweetsClassification(AbsTaskMultilabelClassification):
     metadata = TaskMetadata(
         name="BrazilianToxicTweetsClassification",
-        description="""
-        ToLD-Br is the biggest dataset for toxic tweets in Brazilian Portuguese, crowdsourced by 42 annotators selected from
-        a pool of 129 volunteers. Annotators were selected aiming to create a plural group in terms of demographics (ethnicity,
-        sexual orientation, age, gender). Each tweet was labeled by three annotators in 6 possible categories: LGBTQ+phobia,
-        Xenophobia, Obscene, Insult, Misogyny and Racism.
-        """,
+        description="ToLD-Br is the biggest dataset for toxic tweets in Brazilian Portuguese, crowdsourced by 42 annotators selected from a pool of 129 volunteers. Annotators were selected aiming to create a plural group in terms of demographics (ethnicity, sexual orientation, age, gender). Each tweet was labeled by three annotators in 6 possible categories: LGBTQ+phobia, Xenophobia, Obscene, Insult, Misogyny and Racism.",
         reference="https://paperswithcode.com/dataset/told-br",
         dataset={
             "path": "mteb/BrazilianToxicTweetsClassification",
mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py
@@ -7,7 +7,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class SwedishPatentCPCGroupClassification(AbsTaskMultilabelClassification):
     metadata = TaskMetadata(
         name="SwedishPatentCPCGroupClassification",
-        description="
+        description="This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system at the group level. Each document can have multiple labels, making this a challenging multi-label classification task with significant class imbalance and data sparsity characteristics. The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.",
         reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254",
         type="MultilabelClassification",
         category="t2t",
mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py
@@ -7,8 +7,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class SwedishPatentCPCSubclassClassification(AbsTaskMultilabelClassification):
     metadata = TaskMetadata(
         name="SwedishPatentCPCSubclassClassification",
-        description="""
-        The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.""",
+        description="This dataset contains historical Swedish patent documents (1885-1972) classified according to the Cooperative Patent Classification (CPC) system. Each document can have multiple labels, making this a multi-label classification task with significant implications for patent retrieval and prior art search. The dataset includes patent claims text extracted from digitally recreated versions of historical Swedish patents, generated using Optical Character Recognition (OCR) from original paper documents. The text quality varies due to OCR limitations, but all CPC labels were manually assigned by patent engineers at PRV (Swedish Patent and Registration Office), ensuring high reliability for machine learning applications.",
         reference="https://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-368254",
         type="MultilabelClassification",
         category="t2t",
mteb/tasks/pair_classification/dan/talemaader_pc.py
@@ -5,12 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class TalemaaderPC(AbsTaskPairClassification):
     metadata = TaskMetadata(
         name="TalemaaderPC",
-        description="""
-        The Danish Language and Literature Society has developed a dataset for evaluating language models in Danish.
-        The dataset contains a total of 1000 Danish idioms and fixed expressions with transferred meanings based on the Danish Dictionary's collection of fixed expressions with associated definitions.
-        For each of the 1000 idioms and fixed expressions, three false definitions have also been prepared.
-        The dataset can be used to test the performance of language models in identifying correct definitions for Danish idioms and fixed expressions.
-        """,
+        description="\\ The Danish Language and Literature Society has developed a dataset for evaluating language models in Danish. The dataset contains a total of 1000 Danish idioms and fixed expressions with transferred meanings based on the Danish Dictionary's collection of fixed expressions with associated definitions. For each of the 1000 idioms and fixed expressions, three false definitions have also been prepared. The dataset can be used to test the performance of language models in identifying correct definitions for Danish idioms and fixed expressions.",
         reference="https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet",
         dataset={
             "path": "mteb/talemaader_pc",
mteb/tasks/pair_classification/eng/legal_bench_pc.py
@@ -50,15 +50,7 @@ _DATASET_COLUMN_MAP = [
 class LegalBenchPC(AbsTaskPairClassification):
     metadata = TaskMetadata(
         name="LegalBenchPC",
-        description="""
-
-        - Citation Prediction Classification: Given a legal statement and a case citation, determine if the citation is supportive of the legal statement.
-        - Consumer Contracts QA: The task consists of 400 yes/no questions relating to consumer contracts (specifically, online terms of service) and is relevant to the legal skill of contract interpretation.
-        - Contract QA: Answer yes/no questions about whether contractual clauses discuss particular issues like confidentiality requirements, BIPA consent, PII data breaches, breach of contract etc.
-        - Hearsay: Classify if a particular piece of evidence qualifies as hearsay. Each sample in the dataset describes (1) an issue being litigated or an assertion a party wishes to prove, and (2) a piece of evidence a party wishes to introduce. The goal is to determine if—as it relates to the issue—the evidence would be considered hearsay under the definition provided above.
-        - Privacy Policy Entailment: Given a privacy policy clause and a description of the clause, determine if the description is correct. This is a binary classification task in which the LLM is provided with a clause from a privacy policy, and a description of that clause (e.g., “The policy describes collection of the user’s HTTP cookies, flash cookies, pixel tags, or similar identifiers by a party to the contract.”).
-        - Privacy Policy QA: Given a question and a clause from a privacy policy, determine if the clause contains enough information to answer the question. This is a binary classification task in which the LLM is provided with a question (e.g., “do you publish my data”) and a clause from a privacy policy. The LLM must determine if the clause contains an answer to the question, and classify the question-clause pair.
-        """,
+        description="This LegalBench pair classification task is a combination of the following datasets: - Citation Prediction Classification: Given a legal statement and a case citation, determine if the citation is supportive of the legal statement. - Consumer Contracts QA: The task consists of 400 yes/no questions relating to consumer contracts (specifically, online terms of service) and is relevant to the legal skill of contract interpretation. - Contract QA: Answer yes/no questions about whether contractual clauses discuss particular issues like confidentiality requirements, BIPA consent, PII data breaches, breach of contract etc. - Hearsay: Classify if a particular piece of evidence qualifies as hearsay. Each sample in the dataset describes (1) an issue being litigated or an assertion a party wishes to prove, and (2) a piece of evidence a party wishes to introduce. The goal is to determine if—as it relates to the issue—the evidence would be considered hearsay under the definition provided above. - Privacy Policy Entailment: Given a privacy policy clause and a description of the clause, determine if the description is correct. This is a binary classification task in which the LLM is provided with a clause from a privacy policy, and a description of that clause (e.g., “The policy describes collection of the user’s HTTP cookies, flash cookies, pixel tags, or similar identifiers by a party to the contract.”). - Privacy Policy QA: Given a question and a clause from a privacy policy, determine if the clause contains enough information to answer the question. This is a binary classification task in which the LLM is provided with a question (e.g., “do you publish my data”) and a clause from a privacy policy. The LLM must determine if the clause contains an answer to the question, and classify the question-clause pair.",
         reference="https://huggingface.co/datasets/nguha/legalbench",
         dataset={
             "path": "mteb/LegalBenchPC",
mteb/tasks/pair_classification/rus/__init__.py
@@ -1,3 +1,3 @@
-from .terra import TERRa
+from .terra import TERRa, TERRaV2
 
-__all__ = ["TERRa"]
+__all__ = ["TERRa", "TERRaV2"]
mteb/tasks/pair_classification/rus/terra.py
@@ -1,31 +1,27 @@
 from mteb.abstasks.pair_classification import AbsTaskPairClassification
 from mteb.abstasks.task_metadata import TaskMetadata
+from mteb.types import PromptType
 
-
-class TERRa(AbsTaskPairClassification):
-    metadata = TaskMetadata(
-        name="TERRa",
-        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
-        + "whether the meaning of one text is entailed (can be inferred) from the other text.",
-        dataset={
-            "path": "ai-forever/terra-pairclassification",
-            "revision": "7b58f24536063837d644aab9a023c62199b2a612",
-        },
-        reference="https://arxiv.org/pdf/2010.15925",
-        type="PairClassification",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["rus-Cyrl"],
-        main_score="max_ap",
-        date=("2000-01-01", "2018-01-01"),
-        domains=["News", "Web", "Written"],
-        task_subtypes=[],
-        license="mit",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
+_terra_metadata = dict(
+    dataset={
+        "path": "ai-forever/terra-pairclassification",
+        "revision": "7b58f24536063837d644aab9a023c62199b2a612",
+    },
+    reference="https://arxiv.org/pdf/2010.15925",
+    type="PairClassification",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["dev"],
+    eval_langs=["rus-Cyrl"],
+    main_score="max_ap",
+    date=("2000-01-01", "2018-01-01"),
+    domains=["News", "Web", "Written"],
+    task_subtypes=[],
+    license="mit",
+    annotations_creators="human-annotated",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
 @article{shavrina2020russiansuperglue,
 author = {Shavrina, Tatiana
 and Fenogenova, Alena
@@ -42,7 +38,37 @@ and Evlampiev, Andrey},
 year = {2020},
 }
 """,
+)
+
+
+class TERRa(AbsTaskPairClassification):
+    metadata = TaskMetadata(
+        name="TERRa",
+        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+        + "whether the meaning of one text is entailed (can be inferred) from the other text.",
         prompt="Given a premise, retrieve a hypothesis that is entailed by the premise",
+        **_terra_metadata,
+    )
+
+    def dataset_transform(self):
+        self.dataset = self.dataset.rename_column("sent1", "sentence1")
+        self.dataset = self.dataset.rename_column("sent2", "sentence2")
+
+
+class TERRaV2(AbsTaskPairClassification):
+    input1_prompt_type = PromptType.document
+    input2_prompt_type = PromptType.query
+
+    metadata = TaskMetadata(
+        name="TERRa.V2",
+        description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, "
+        + "whether the meaning of one text is entailed (can be inferred) from the other text."
+        + " Version 2 uses different prompt types for the two inputs.",
+        adapted_from=["TERRa"],
+        prompt={
+            PromptType.query.value: "Given a premise, retrieve a hypothesis that is entailed by the premise"
+        },
+        **_terra_metadata,
     )
 
     def dataset_transform(self):
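The terra.py refactor above moves every field shared by the two task variants into a plain _terra_metadata dict that both TaskMetadata calls unpack with **, while TERRaV2 additionally assigns asymmetric prompt roles to its two inputs (PromptType.document for the premise side, PromptType.query for the hypothesis side). As a minimal sketch of how the two variants could be fetched and compared, assuming only mteb's public get_tasks lookup (the printed attributes are illustrative, not a prescribed workflow):

import mteb

# Task names come from the TaskMetadata definitions above
# ("TERRa" and "TERRa.V2").
tasks = mteb.get_tasks(tasks=["TERRa", "TERRa.V2"])
for task in tasks:
    # Fields unpacked from _terra_metadata are identical across variants;
    # only the name, description, and prompt handling differ.
    print(task.metadata.name, task.metadata.main_score, task.metadata.prompt)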
mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class SprintDuplicateQuestionsPCVN(AbsTaskPairClassification):
     metadata = TaskMetadata(
         name="SprintDuplicateQuestions-VN",
-        description="""
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Duplicate questions from the Sprint community. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://www.aclweb.org/anthology/D18-1131/",
         dataset={
             "path": "GreenNode/sprintduplicatequestions-pairclassification-vn",
mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py
@@ -9,11 +9,7 @@ class TwitterSemEval2015PCVN(AbsTaskPairClassification):
             "path": "GreenNode/twittersemeval2015-pairclassification-vn",
             "revision": "9215a3c954078fd15c2bbecca914477d53944de1",
         },
-        description="""
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Paraphrase-Pairs of Tweets from the SemEval 2015 workshop. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://alt.qcri.org/semeval2015/task1/",
         category="t2c",
         type="PairClassification",
mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py
@@ -9,11 +9,7 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
             "path": "GreenNode/twitterurlcorpus-pairclassification-vn",
             "revision": "6e6a40aaade2129f70432f2156a6d24b63d72be3",
         },
-        description="""
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Paraphrase-Pairs of Tweets. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://languagenet.github.io/",
         category="t2c",
         type="PairClassification",
mteb/tasks/regression/multilingual/ru_sci_bench_regression.py
@@ -5,9 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class RuSciBenchCitedCountRegression(AbsTaskRegression):
     metadata = TaskMetadata(
         name="RuSciBenchCitedCountRegression",
-        description="""
-        The prediction is based on the article's title and abstract. The data is sourced from the Russian electronic
-        library of scientific publications (eLibrary.ru) and includes papers with both Russian and English abstracts.""",
+        description="Predicts the number of times a scientific article has been cited by other papers. The prediction is based on the article's title and abstract. The data is sourced from the Russian electronic library of scientific publications (eLibrary.ru) and includes papers with both Russian and English abstracts.",
         reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
         dataset={
             "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
@@ -51,9 +49,7 @@ class RuSciBenchCitedCountRegression(AbsTaskRegression):
 class RuSciBenchYearPublRegression(AbsTaskRegression):
     metadata = TaskMetadata(
         name="RuSciBenchYearPublRegression",
-        description="""
-        article's title and abstract. The data is sourced from the Russian electronic library of scientific
-        publications (eLibrary.ru) and includes papers with both Russian and English abstracts.""",
+        description="Predicts the publication year of a scientific article. The prediction is based on the article's title and abstract. The data is sourced from the Russian electronic library of scientific publications (eLibrary.ru) and includes papers with both Russian and English abstracts.",
         reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
         dataset={
             "path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
mteb/tasks/reranking/jpn/__init__.py
@@ -1,5 +1,13 @@
 from .j_qa_ra_reranking import JQaRAReranking
+from .j_qa_ra_reranking_lite import JQaRARerankingLite
 from .ja_cwir_reranking import JaCWIRReranking
+from .ja_cwir_reranking_lite import JaCWIRRerankingLite
 from .m_marco_reranking import VoyageMMarcoReranking
 
-__all__ = ["JQaRAReranking", "JaCWIRReranking", "VoyageMMarcoReranking"]
+__all__ = [
+    "JQaRAReranking",
+    "JQaRARerankingLite",
+    "JaCWIRReranking",
+    "JaCWIRRerankingLite",
+    "VoyageMMarcoReranking",
+]
mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py
@@ -0,0 +1,49 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class JQaRARerankingLite(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="JQaRARerankingLite",
+        dataset={
+            "path": "mteb/JQaRARerankingLite",
+            "revision": "d23d3ad479f74824ed126052e810eac47e685558",
+        },
+        description=(
+            "JQaRA (Japanese Question Answering with Retrieval Augmentation) is a reranking dataset "
+            "consisting of questions from JAQKET and corpus from Japanese Wikipedia. This is the lightweight "
+            "version with a reduced corpus (172,897 documents) constructed using hard negatives from "
+            "5 high-performance models."
+        ),
+        reference="https://huggingface.co/datasets/hotchpotch/JQaRA",
+        type="Reranking",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["jpn-Jpan"],
+        main_score="ndcg_at_10",
+        date=("2020-01-01", "2025-01-01"),
+        domains=["Encyclopaedic", "Non-fiction", "Written"],
+        task_subtypes=["Question answering"],
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=["jpn-Jpan"],
+        sample_creation="found",
+        adapted_from=["JQaRAReranking"],
+        bibtex_citation=r"""
+@misc{jmteb_lite,
+  author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide
+and Kawahara, Daisuke},
+  howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB-lite}},
+  title = {{J}{M}{T}{E}{B}-lite: {T}he {L}ightweight {V}ersion of {JMTEB}},
+  year = {2025},
+}
+
+@misc{yuichi-tateno-2024-jqara,
+  author = {Yuichi Tateno},
+  title = {JQaRA: Japanese Question Answering with Retrieval Augmentation
+- 検索拡張(RAG)評価のための日本語Q&Aデータセット},
+  url = {https://huggingface.co/datasets/hotchpotch/JQaRA},
+}
+""",
+    )
@@ -0,0 +1,47 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class JaCWIRRerankingLite(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="JaCWIRRerankingLite",
+        dataset={
+            "path": "mteb/JaCWIRRerankingLite",
+            "revision": "b7c738193fb9b20c97c2b5d9a8fa3f3d28503dc0",
+        },
+        description=(
+            "JaCWIR (Japanese Casual Web IR) is a dataset consisting of questions and webpage meta descriptions "
+            "collected from Hatena Bookmark. This is the lightweight reranking version with a reduced corpus "
+            "(188,033 documents) constructed using hard negatives from 5 high-performance models."
+        ),
+        reference="https://huggingface.co/datasets/hotchpotch/JaCWIR",
+        type="Reranking",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["jpn-Jpan"],
+        main_score="ndcg_at_10",
+        date=("2020-01-01", "2025-01-01"),
+        domains=["Web", "Written"],
+        task_subtypes=["Article retrieval"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        adapted_from=["JaCWIRReranking"],
+        bibtex_citation=r"""
+@misc{jmteb_lite,
+  author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide
+and Kawahara, Daisuke},
+  howpublished = {\url{https://huggingface.co/datasets/sbintuitions/JMTEB-lite}},
+  title = {{J}{M}{T}{E}{B}-lite: {T}he {L}ightweight {V}ersion of {JMTEB}},
+  year = {2025},
+}
+
+@misc{yuichi-tateno-2024-jacwir,
+  author = {Yuichi Tateno},
+  title = {JaCWIR: Japanese Casual Web IR - 日本語情報検索評価のための小規模でカジュアルなWebタイトルと概要のデータセット},
+  url = {https://huggingface.co/datasets/hotchpotch/JaCWIR},
+}
+""",
+    )
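Both lite variants record their provenance in the metadata itself, so the link back to the full-corpus parent task can be read off programmatically. A small sketch (get_task is the standard single-task lookup; the attribute names are the TaskMetadata fields shown above):

import mteb

# Fetch one task and inspect the fields declared in its TaskMetadata.
task = mteb.get_task("JaCWIRRerankingLite")
print(task.metadata.main_score)    # ndcg_at_10
print(task.metadata.adapted_from)  # ['JaCWIRReranking'], the full-corpus parent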
@@ -1,6 +1,7 @@
 from .esci_reranking import ESCIReranking
 from .hume_wikipedia_reranking_multilingual import HUMEWikipediaRerankingMultilingual
 from .miracl_reranking import MIRACLReranking
+from .multi_long_doc_reranking import MultiLongDocReranking
 from .wikipedia_reranking_multilingual import WikipediaRerankingMultilingual
 from .x_glue_wpr_reranking import XGlueWPRReranking
 
@@ -8,6 +9,7 @@ __all__ = [
     "ESCIReranking",
     "HUMEWikipediaRerankingMultilingual",
     "MIRACLReranking",
+    "MultiLongDocReranking",
     "WikipediaRerankingMultilingual",
     "XGlueWPRReranking",
 ]
@@ -0,0 +1,70 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class MultiLongDocReranking(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="MultiLongDocReranking",
+        description=(
+            "Reranking version of MultiLongDocRetrieval (MLDR). MLDR is a Multilingual Long-Document "
+            "Retrieval dataset built on Wikipedia, Wudao and mC4, covering 13 typologically diverse languages. "
+            "Specifically, we sample lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose "
+            "paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. "
+            "The generated question and the sampled article constitute a new text pair to the dataset."
+        ),
+        reference="https://huggingface.co/datasets/Shitao/MLDR",
+        dataset={
+            "path": "mteb/MultiLongDocReranking",
+            "revision": "ad09ce14c17bce6edae151b7f6ef12e15d91dbf3",
+        },
+        type="Reranking",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ar": ["ara-Arab"],
+            "de": ["deu-Latn"],
+            "en": ["eng-Latn"],
+            "es": ["spa-Latn"],
+            "fr": ["fra-Latn"],
+            "hi": ["hin-Deva"],
+            "it": ["ita-Latn"],
+            "ja": ["jpn-Jpan"],
+            "ko": ["kor-Kore"],
+            "pt": ["por-Latn"],
+            "ru": ["rus-Cyrl"],
+            "th": ["tha-Thai"],
+            "zh": ["zho-Hans"],
+        },
+        main_score="ndcg_at_10",
+        date=(
+            "2000-01-01",
+            "2024-12-31",
+        ),  # Not found in the paper, guessed using the paper's publication date and constituent datasets
+        domains=[
+            "Encyclopaedic",
+            "Written",
+            "Web",
+            "Non-fiction",
+            "Fiction",
+        ],  # narrativeqa, wikipedia, wudao, mC4
+        task_subtypes=[],
+        license="mit",
+        annotations_creators="LM-generated",  # gpt-3.5
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{bge-m3,
+  archiveprefix = {arXiv},
+  author = {Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
+  eprint = {2402.03216},
+  primaryclass = {cs.CL},
+  title = {BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
+  year = {2024},
+}
+""",
+        prompt={
+            "query": "Given a question, rerank long documents based on their relevance to answer the question"
+        },
+        adapted_from=["MultiLongDocRetrieval"],
+    )
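Because eval_langs is a mapping here, each key ("ar", "de", ...) becomes a separate evaluation subset of the task. A sketch of slicing it down to one language at lookup time, assuming get_tasks still accepts a languages filter with ISO 639-3 codes as in earlier mteb releases:

import mteb

# Keep only the Japanese subset of the multilingual reranking task.
tasks = mteb.get_tasks(tasks=["MultiLongDocReranking"], languages=["jpn"])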
@@ -44,7 +44,7 @@ class WikipediaRerankingMultilingual(AbsTaskRetrieval):
         dialect=[],
         sample_creation="LM-generated and verified",
         bibtex_citation=r"""
-@online{
+@online{wikidump2024,
   author = {Wikimedia Foundation},
   title = {Wikimedia Downloads},
   url = {https://dumps.wikimedia.org},
@@ -78,8 +78,7 @@ _CITATION = r"""
 class XGlueWPRReranking(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="XGlueWPRReranking",
-        description="""XGLUE is a new benchmark dataset to evaluate the performance of cross-lingual pre-trained models
-with respect to cross-lingual natural language understanding and generation. XGLUE is composed of 11 tasks spans 19 languages.""",
+        description="XGLUE is a new benchmark dataset to evaluate the performance of cross-lingual pre-trained models with respect to cross-lingual natural language understanding and generation. XGLUE is composed of 11 tasks spans 19 languages.",
         reference="https://github.com/microsoft/XGLUE",
         dataset={
             "path": "mteb/XGlueWPRReranking",
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class AskUbuntuDupQuestionsVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="AskUbuntuDupQuestions-VN",
-        description="""A translated dataset from AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://github.com/taolei87/askubuntu",
         dataset={
             "path": "mteb/AskUbuntuDupQuestions-VN",
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class SciDocsRerankingVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="SciDocsRR-VN",
-        description="""A translated dataset from Ranking of related scientific papers based on their title.
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Ranking of related scientific papers based on their title. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://allenai.org/data/scidocs",
         dataset={
             "path": "mteb/SciDocsRR-VN",
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class StackOverflowDupQuestionsVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="StackOverflowDupQuestions-VN",
-        description="""A translated dataset from Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf",
         dataset={
             "path": "mteb/StackOverflowDupQuestions-VN",