PyPI - mteb - Versions diffs - 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl - Mend

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (412) hide show

mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py ADDED Viewed

@@ -0,0 +1,33 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+class DutchNewsArticlesRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="DutchNewsArticlesRetrieval",
+        description="This dataset contains all the articles published by the NOS as of the 1st of January 2010. The "
+        "data is obtained by scraping the NOS website. The NOS is one of the biggest (online) news "
+        "organizations in the Netherlands.",
+        reference="https://www.kaggle.com/datasets/maxscheijen/dutch-news-articles",
+        dataset={
+            "path": "clips/mteb-nl-news-articles-ret",
+            "revision": "c8042a86f3eb0d1fcec79a4a44ebf1eafe635462",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["nld-Latn"],
+        main_score="ndcg_at_10",
+        date=("2009-11-01", "2010-01-01"),
+        domains=["Written", "News"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="",
+        prompt={
+            "query": "Gegeven een titel, haal het nieuwsartikel op dat het beste bij de titel past"
+        },
+    )

mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py ADDED Viewed

@@ -0,0 +1,42 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+class LegalQANLRetrieval(AbsTaskRetrieval):
+    ignore_identical_ids = True
+    metadata = TaskMetadata(
+        name="LegalQANLRetrieval",
+        description="To this end, we create and publish a Dutch legal QA dataset, consisting of question-answer pairs "
+        "with attributions to Dutch law articles.",
+        reference="https://aclanthology.org/2024.nllp-1.12/",
+        dataset={
+            "path": "clips/mteb-nl-legalqa-pr",
+            "revision": "8f593522dfbe7ec07055ca9d38a700e7643d3882",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["nld-Latn"],
+        main_score="ndcg_at_10",
+        date=("2021-05-01", "2021-08-26"),
+        domains=["Legal", "Written"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{redelaar2024attributed,
+  author = {Redelaar, Felicia and Van Drie, Romy and Verberne, Suzan and De Boer, Maaike},
+  booktitle = {Proceedings of the natural legal language processing workshop 2024},
+  pages = {154--165},
+  title = {Attributed Question Answering for Preconditions in the Dutch Law},
+  year = {2024},
+}
+""",
+        prompt={
+            "query": "Gegeven een juridische vraag, haal documenten op die kunnen helpen bij het beantwoorden van de vraag"
+        },
+    )

mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py CHANGED Viewed

@@ -1,31 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
-class NFCorpusNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="NFCorpus-NL",
-        dataset={
-            "path": "clips/beir-nl-nfcorpus",
-            "revision": "942953e674fd0f619ff89897abb806dc3df5dd39",
-        },
-        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
-        "a Dutch translation.",
-        reference="https://huggingface.co/datasets/clips/beir-nl-nfcorpus",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
-        domains=["Medical", "Academic", "Written"],
-        task_subtypes=[],
-        license="cc-by-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_nf_corpus_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-nfcorpus",
+        "revision": "942953e674fd0f619ff89897abb806dc3df5dd39",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-nfcorpus",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2016-03-01", "2016-03-01"),  # best guess: based on publication date
+    domains=["Medical", "Academic", "Written"],
+    task_subtypes=[],
+    license="cc-by-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -36,5 +31,27 @@ class NFCorpusNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+class NFCorpusNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NFCorpus-NL",
+        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
+        "a Dutch translation.",
         adapted_from=["NFCorpus"],
+        **_nf_corpus_metadata,
+    )
+class NFCorpusNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NFCorpus-NL.v2",
+        description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. NFCorpus-NL is "
+        "a Dutch translation. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["NFCorpus-NL"],
+        prompt={
+            "query": "Gegeven een vraag, haal relevante documenten op die de vraag het beste beantwoorden"
+        },
+        **_nf_corpus_metadata,
     )

mteb/tasks/retrieval/nld/open_tender_retrieval.py ADDED Viewed

@@ -0,0 +1,41 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+class OpenTenderRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="OpenTenderRetrieval",
+        description="This dataset contains Belgian and Dutch tender calls from OpenTender in Dutch",
+        reference="https://arxiv.org/abs/2509.12340",
+        dataset={
+            "path": "clips/mteb-nl-opentender-ret",
+            "revision": "83eec1aa9c58f1dc8acfac015f653a9c25bda3f4",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["nld-Latn"],
+        main_score="ndcg_at_10",
+        date=("2009-11-01", "2010-01-01"),
+        domains=["Government", "Written"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{banar2025mtebnle5nlembeddingbenchmark,
+  archiveprefix = {arXiv},
+  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
+  eprint = {2509.12340},
+  primaryclass = {cs.CL},
+  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
+  url = {https://arxiv.org/abs/2509.12340},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Gegeven een titel, haal de aanbestedingsbeschrijving op die het beste bij de titel past"
+        },
+    )

mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py CHANGED Viewed

@@ -1,30 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
-class SciFactNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="SciFact-NL",
-        dataset={
-            "path": "clips/beir-nl-scifact",
-            "revision": "856d8dfc294b138856bbf3042450e3782321e44e",
-        },
-        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature containing scientific paper abstracts.",
-        reference="https://huggingface.co/datasets/clips/beir-nl-scifact",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
-        domains=["Academic", "Medical", "Written"],
-        task_subtypes=[],
-        license="cc-by-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_sci_fact_nl_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-scifact",
+        "revision": "856d8dfc294b138856bbf3042450e3782321e44e",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-scifact",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
+    domains=["Academic", "Medical", "Written"],
+    task_subtypes=[],
+    license="cc-by-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -35,5 +31,27 @@ class SciFactNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+class SciFactNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SciFact-NL",
+        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature "
+        "containing scientific paper abstracts.",
         adapted_from=["SciFact"],
+        **_sci_fact_nl_metadata,
+    )
+class SciFactNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SciFact-NL.v2",
+        description="SciFactNL verifies scientific claims in Dutch using evidence from the research literature "
+        "containing scientific paper abstracts. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["SciFact-NL"],
+        prompt={
+            "query": "Given a scientific claim, retrieve documents that support or refute the claim"
+        },
+        **_sci_fact_nl_metadata,
     )

mteb/tasks/retrieval/nld/scidocsnl_retrieval.py CHANGED Viewed

@@ -1,33 +1,26 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
-class SCIDOCSNL(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="SCIDOCS-NL",
-        dataset={
-            "path": "clips/beir-nl-scidocs",
-            "revision": "4e018aa220029f9d1bd5a31de3650e322e32ea38",
-        },
-        description=(
-            "SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation"
-            + " prediction, to document classification and recommendation. SciDocs-NL is a Dutch translation."
-        ),
-        reference="https://huggingface.co/datasets/clips/beir-nl-scidocs",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["nld-Latn"],
-        main_score="ndcg_at_10",
-        date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
-        domains=["Academic", "Written", "Non-fiction"],
-        task_subtypes=[],
-        license="cc-by-sa-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="machine-translated and verified",  # manually checked a small subset
-        bibtex_citation=r"""
+_scidocsnl_metadata = dict(
+    dataset={
+        "path": "clips/beir-nl-scidocs",
+        "revision": "4e018aa220029f9d1bd5a31de3650e322e32ea38",
+    },
+    reference="https://huggingface.co/datasets/clips/beir-nl-scidocs",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["nld-Latn"],
+    main_score="ndcg_at_10",
+    date=("2020-05-01", "2020-05-01"),  # best guess: based on submission date
+    domains=["Academic", "Written", "Non-fiction"],
+    task_subtypes=[],
+    license="cc-by-sa-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="machine-translated and verified",  # manually checked a small subset
+    bibtex_citation=r"""
 @misc{banar2024beirnlzeroshotinformationretrieval,
   archiveprefix = {arXiv},
   author = {Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
@@ -38,5 +31,29 @@ class SCIDOCSNL(AbsTaskRetrieval):
   year = {2024},
 }
 """,
+)
+class SCIDOCSNL(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SCIDOCS-NL",
+        description="SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from "
+        "citation prediction, to document classification and recommendation. SciDocs-NL is a Dutch "
+        "translation.",
         adapted_from=["SCIDOCS"],
+        **_scidocsnl_metadata,
+    )
+class SCIDOCSNLv2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="SCIDOCS-NL.v2",
+        description="SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from "
+        "citation prediction, to document classification and recommendation. SciDocs-NL is a Dutch "
+        "translation. This version adds a Dutch prompt to the dataset.",
+        adapted_from=["SCIDOCS-NL"],
+        **_scidocsnl_metadata,
+        prompt={
+            "query": "Gegeven de titel van een wetenschappelijk artikel, haal de abstracts op van artikelen die door het gegeven artikel worden geciteerd"
+        },
     )

mteb/tasks/retrieval/nld/vabb_retrieval.py ADDED Viewed

@@ -0,0 +1,44 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+class VABBRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="VABBRetrieval",
+        description="This dataset contains the fourteenth edition of the Flemish Academic Bibliography for the Social "
+        "Sciences and Humanities (VABB-SHW), a database of academic publications from the social sciences "
+        "and humanities authored by researchers affiliated to Flemish universities (more information). "
+        "Publications in the database are used as one of the parameters of the Flemish performance-based "
+        "research funding system",
+        reference="https://zenodo.org/records/14214806",
+        dataset={
+            "path": "clips/mteb-nl-vabb-ret",
+            "revision": "af4a1e5b3ed451103894f86ff6b3ce85085d7b48",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["nld-Latn"],
+        main_score="ndcg_at_10",
+        date=("2009-11-01", "2010-01-01"),
+        domains=["Academic", "Written"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@dataset{aspeslagh2024vabb,
+  author = {Aspeslagh, Pieter and Guns, Raf and Engels, Tim C. E.},
+  doi = {10.5281/zenodo.14214806},
+  publisher = {Zenodo},
+  title = {VABB-SHW: Dataset of Flemish Academic Bibliography for the Social Sciences and Humanities (edition 14)},
+  url = {https://doi.org/10.5281/zenodo.14214806},
+  year = {2024},
+}
+""",
+        prompt={
+            "query": "Gegeven een titel, haal de wetenschappelijke abstract op die het beste bij de titel past"
+        },
+    )

mteb/tasks/retrieval/nob/norquad.py CHANGED Viewed

@@ -59,9 +59,9 @@ Fishel, Mark},
         self.data_loaded = True
     def dataset_transform(self) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

mteb/tasks/retrieval/nob/snl_retrieval.py CHANGED Viewed

@@ -46,9 +46,9 @@ class SNLRetrieval(AbsTaskRetrieval):
         self.data_loaded = True
     def dataset_transform(self) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

mteb/tasks/retrieval/rus/__init__.py CHANGED Viewed

@@ -1,4 +1,13 @@
-from .ria_news_retrieval import RiaNewsRetrieval, RiaNewsRetrievalHardNegatives
+from .ria_news_retrieval import (
+    RiaNewsRetrieval,
+    RiaNewsRetrievalHardNegatives,
+    RiaNewsRetrievalHardNegativesV2,
+)
 from .ru_bq_retrieval import RuBQRetrieval
-__all__ = ["RiaNewsRetrieval", "RiaNewsRetrievalHardNegatives", "RuBQRetrieval"]
+__all__ = [
+    "RiaNewsRetrieval",
+    "RiaNewsRetrievalHardNegatives",
+    "RiaNewsRetrievalHardNegativesV2",
+    "RuBQRetrieval",
+]

mteb/tasks/retrieval/rus/ria_news_retrieval.py CHANGED Viewed

@@ -1,6 +1,31 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
+_ria_news_metadata = dict(
+    reference="https://arxiv.org/abs/1901.07786",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["rus-Cyrl"],
+    main_score="ndcg_at_10",
+    date=("2010-01-01", "2014-12-31"),
+    domains=["News", "Written"],
+    task_subtypes=["Article retrieval"],
+    license="cc-by-nc-nd-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
+@inproceedings{gavrilov2018self,
+  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
+  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
+  title = {Self-Attentive Model for Headline Generation},
+  year = {2019},
+}
+""",
+)
 class RiaNewsRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
@@ -12,29 +37,8 @@ class RiaNewsRetrieval(AbsTaskRetrieval):
             "revision": "82374b0bbacda6114f39ff9c5b925fa1512ca5d7",
         },
         description="News article retrieval by headline. Based on Rossiya Segodnya dataset.",
-        reference="https://arxiv.org/abs/1901.07786",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["rus-Cyrl"],
-        main_score="ndcg_at_10",
-        date=("2010-01-01", "2014-12-31"),
-        domains=["News", "Written"],
-        task_subtypes=["Article retrieval"],
-        license="cc-by-nc-nd-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{gavrilov2018self,
-  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
-  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
-  title = {Self-Attentive Model for Headline Generation},
-  year = {2019},
-}
-""",
         prompt={"query": "Given a news title, retrieve relevant news article"},
+        **_ria_news_metadata,
     )
@@ -48,27 +52,27 @@ class RiaNewsRetrievalHardNegatives(AbsTaskRetrieval):
             "revision": "d42860a6c15f0a2c4485bda10c6e5b641fdfe479",
         },
         description="News article retrieval by headline. Based on Rossiya Segodnya dataset. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
-        reference="https://arxiv.org/abs/1901.07786",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["rus-Cyrl"],
-        main_score="ndcg_at_10",
-        date=("2010-01-01", "2014-12-31"),
-        domains=["News", "Written"],
-        task_subtypes=["Article retrieval"],
-        license="cc-by-nc-nd-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{gavrilov2018self,
-  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
-  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
-  title = {Self-Attentive Model for Headline Generation},
-  year = {2019},
-}
-""",
         adapted_from=["RiaNewsRetrieval"],
+        superseded_by="RiaNewsRetrievalHardNegatives.v2",
+        **_ria_news_metadata,
+    )
+class RiaNewsRetrievalHardNegativesV2(AbsTaskRetrieval):
+    ignore_identical_ids = True
+    metadata = TaskMetadata(
+        name="RiaNewsRetrievalHardNegatives.v2",
+        dataset={
+            "path": "mteb/RiaNewsRetrieval_test_top_250_only_w_correct-v2",
+            "revision": "d42860a6c15f0a2c4485bda10c6e5b641fdfe479",
+        },
+        description=(
+            "News article retrieval by headline. Based on Rossiya Segodnya dataset. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
+        ),
+        adapted_from=["RiaNewsRetrieval"],
+        prompt={"query": "Given a news title, retrieve relevant news article"},
+        **_ria_news_metadata,
     )

mteb/tasks/retrieval/slk/slovak_sum_retrieval.py CHANGED Viewed

@@ -7,13 +7,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class SlovakSumRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="SlovakSumRetrieval",
-        description="""
-            SlovakSum, a Slovak news summarization dataset consisting of over 200 thousand
-            news articles with titles and short abstracts obtained from multiple Slovak newspapers.
-            Originally intended as a summarization task, but since no human annotations were provided
-            here reformulated to a retrieval task.
-        """,
+        description="SlovakSum, a Slovak news summarization dataset consisting of over 200 thousand news articles with titles and short abstracts obtained from multiple Slovak newspapers. Originally intended as a summarization task, but since no human annotations were provided here reformulated to a retrieval task.",
         reference="https://huggingface.co/datasets/NaiveNeuron/slovaksum",
         dataset={
             "path": "NaiveNeuron/slovaksum",

mteb/tasks/retrieval/tur/tur_hist_quad.py CHANGED Viewed

@@ -42,9 +42,9 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
     )
     def load_data(self, **kwargs) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py CHANGED Viewed

@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class ArguAnaVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="ArguAna-VN",
-        description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval
-            The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-            - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-            - Applies advanced embedding models to filter the translations.
-            - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="http://argumentation.bplaced.net/arguana/data",
         dataset={
             "path": "GreenNode/arguana-vn",

mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py CHANGED Viewed

@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class ClimateFEVERVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="ClimateFEVER-VN",
-        description="""A translated dataset from CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change.
-            The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-            - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-            - Applies advanced embedding models to filter the translations.
-            - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
         dataset={
             "path": "GreenNode/climate-fever-vn",

mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py CHANGED Viewed

@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class CQADupstackAndroidVN(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="CQADupstackAndroid-VN",
-        description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research
-            The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-            - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-            - Applies advanced embedding models to filter the translations.
-            - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/",
         dataset={
             "path": "GreenNode/cqadupstack-android-vn",

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl