mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +8 -3
- mteb/_evaluators/any_sts_evaluator.py +14 -12
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +0 -9
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +2 -1
- mteb/benchmarks/_create_table.py +1 -3
- mteb/benchmarks/benchmark.py +18 -1
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +125 -16
- mteb/benchmarks/get_benchmark.py +3 -1
- mteb/cache.py +7 -3
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/evaluate.py +26 -6
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/app.py +5 -3
- mteb/leaderboard/benchmark_selector.py +4 -2
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/instruct_wrapper.py +3 -0
- mteb/models/model_implementations/align_models.py +6 -0
- mteb/models/model_implementations/andersborges.py +51 -0
- mteb/models/model_implementations/ara_models.py +7 -0
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/blip2_models.py +9 -0
- mteb/models/model_implementations/blip_models.py +19 -0
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/cadet_models.py +8 -0
- mteb/models/model_implementations/cde_models.py +12 -0
- mteb/models/model_implementations/codefuse_models.py +15 -0
- mteb/models/model_implementations/codesage_models.py +12 -0
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/colqwen_models.py +57 -0
- mteb/models/model_implementations/emillykkejensen_models.py +70 -0
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jasper_models.py +253 -2
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/kalm_models.py +159 -25
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +8 -2
- mteb/models/model_implementations/moco_models.py +9 -0
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/openclip_models.py +16 -0
- mteb/models/model_implementations/piccolo_models.py +6 -0
- mteb/models/model_implementations/rasgaard_models.py +33 -0
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
- mteb/models/model_implementations/tarka_models.py +374 -0
- mteb/models/model_implementations/voyage_models.py +6 -7
- mteb/models/model_implementations/voyage_v.py +10 -9
- mteb/models/model_implementations/yuan_models.py +33 -0
- mteb/models/search_wrappers.py +6 -5
- mteb/results/task_result.py +19 -17
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +2 -3
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
- mteb/tasks/classification/nld/iconclass_classification.py +44 -0
- mteb/tasks/classification/nld/open_tender_classification.py +41 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +18 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb-2.1.19.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
Selected diff hunks:

```diff
@@ -20,9 +20,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         corpus[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"corpus-{split}-{idx}",
-                # "text": None,
                 "modality": "text",
-                "image": None,
             },
             with_indices=True,
             remove_columns=[
```
```diff
@@ -37,9 +35,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
-                "text": None,
                 "modality": "image",
-                # "image": None,
             },
             with_indices=True,
             remove_columns=[
```
```diff
@@ -20,9 +20,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         corpus[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"corpus-{split}-{idx}",
-                "text": None,
                 "modality": "image",
-                # "image": None,
             },
             with_indices=True,
             remove_columns=[
```
```diff
@@ -37,9 +35,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
-                # "text": None,
                 "modality": "text",
-                "image": None,
             },
             with_indices=True,
             remove_columns=[
```
```diff
@@ -24,7 +24,6 @@ def _load_data(
         lambda x: {
             "id": f"query-{split}-{x['query-id']}",
             "text": x["query"],
-            "image": None,
             "modality": "text",
         },
         remove_columns=["query-id", "query"],
```
```diff
@@ -40,7 +39,6 @@ def _load_data(
     corpus_ds = corpus_ds.map(
         lambda x: {
             "id": f"corpus-{split}-{x['corpus-id']}",
-            "text": None,
             "modality": "image",
         },
         remove_columns=["corpus-id"],
```
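A pattern that repeats across these loader hunks (and again further down): the `map` calls stop emitting explicit `"text": None` / `"image": None` placeholder columns and keep only the populated `modality` field. A minimal sketch of the resulting behaviour, using an invented two-row corpus (the real loaders run the same `datasets.Dataset.map` shown in the hunks):

```python
from datasets import Dataset

# Invented miniature corpus standing in for the real image datasets.
raw = Dataset.from_dict({"image": ["img_0.png", "img_1.png"]})

# 2.1.x style: only populated fields are emitted, so the schema no longer
# carries an all-null "text" column alongside the image data.
corpus = raw.map(
    lambda x, idx: {
        "id": f"corpus-test-{idx}",
        "modality": "image",
    },
    with_indices=True,
)

print(corpus.column_names)  # ['image', 'id', 'modality'] (no null "text" column)
```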
```diff
@@ -9,7 +9,7 @@ class WinoGrande(AbsTaskRetrieval):
         reference="https://winogrande.allenai.org/",
         dataset={
             "path": "mteb/WinoGrande",
-            "revision": "
+            "revision": "4dec9c5666e9f84702ac614363db6d96a68bc6de",
         },
         type="Retrieval",
         category="t2t",
```
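The hunk above pins the WinoGrande dataset to an exact commit hash, which fixes the evaluated snapshot across runs. A hedged illustration of what such a pin resolves to (mteb performs the equivalent load internally; the dataset may additionally require selecting a subset):

```python
from datasets import load_dataset

# Loading the pinned snapshot directly (illustrative; requires network access).
ds = load_dataset(
    "mteb/WinoGrande",
    revision="4dec9c5666e9f84702ac614363db6d96a68bc6de",
)
```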
```diff
@@ -9,10 +9,7 @@ class JaCWIRRetrieval(AbsTaskRetrieval):
 
     metadata = TaskMetadata(
         name="JaCWIRRetrieval",
-        description="
-        5000 question texts and approximately 500k web page titles and web page introductions or summaries
-        (meta descriptions, etc.). The question texts are created based on one of the 500k web pages,
-        and that data is used as a positive example for the question text.""",
+        description="JaCWIR is a small-scale Japanese information retrieval evaluation dataset consisting of 5000 question texts and approximately 500k web page titles and web page introductions or summaries (meta descriptions, etc.). The question texts are created based on one of the 500k web pages, and that data is used as a positive example for the question text.",
         reference="https://huggingface.co/datasets/hotchpotch/JaCWIR",
         dataset={
             "path": "mteb/JaCWIRRetrieval",
```
```diff
@@ -7,7 +7,7 @@ class JaGovFaqsRetrieval(AbsTaskRetrieval):
 
     metadata = TaskMetadata(
         name="JaGovFaqsRetrieval",
-        description="JaGovFaqs is a dataset consisting of FAQs
+        description="JaGovFaqs is a dataset consisting of FAQs manually extracted from the website of Japanese bureaus. The dataset consists of 22k FAQs, where the queries (questions) and corpus (answers) have been shuffled, and the goal is to match the answer with the question.",
         reference="https://github.com/sbintuitions/JMTEB",
         dataset={
             "path": "mteb/JaGovFaqsRetrieval",
```
```diff
@@ -46,10 +46,17 @@ class GeorgianFAQRetrieval(AbsTaskRetrieval):
             split=_EVAL_SPLIT,
             revision=self.metadata.dataset["revision"],
         )
-
-
-        }
-
+
+        question_ids = {}
+        answer_ids = {}
+
+        for row in data:
+            question = row["question"]
+            answer = row["answer"]
+            if question not in question_ids:
+                question_ids[question] = len(question_ids)
+            if answer not in answer_ids:
+                answer_ids[answer] = len(answer_ids)
 
         for row in data:
             question = row["question"]
```
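The removed lines are truncated in this rendering, but the replacement is clear: ids are now assigned by first occurrence while iterating the data, so duplicate questions or answers collapse to a single id and the numbering is deterministic across runs. A self-contained sketch of the pattern with invented rows:

```python
rows = [
    {"question": "How do I renew a passport?", "answer": "Apply online."},
    {"question": "How do I renew a passport?", "answer": "Visit an office."},
    {"question": "What is the fee?", "answer": "Apply online."},
]

question_ids: dict[str, int] = {}
answer_ids: dict[str, int] = {}
for row in rows:
    # Duplicates map to one id; ids follow first-seen order.
    if row["question"] not in question_ids:
        question_ids[row["question"]] = len(question_ids)
    if row["answer"] not in answer_ids:
        answer_ids[row["answer"]] = len(answer_ids)

assert question_ids == {"How do I renew a passport?": 0, "What is the fee?": 1}
assert answer_ids == {"Apply online.": 0, "Visit an office.": 1}
```

The same first-occurrence indexing appears in the BelebeleRetrieval and PublicHealthQA hunks below.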
```diff
@@ -81,6 +81,18 @@ from .vidore2_bench_retrieval import (
     Vidore2ESGReportsHLRetrieval,
     Vidore2ESGReportsRetrieval,
 )
+from .vidore3_bench_retrieval import (
+    Vidore3ComputerScienceRetrieval,
+    Vidore3EnergyRetrieval,
+    Vidore3FinanceEnRetrieval,
+    Vidore3FinanceFrRetrieval,
+    Vidore3HrRetrieval,
+    Vidore3IndustrialRetrieval,
+    Vidore3NuclearRetrieval,
+    Vidore3PharmaceuticalsRetrieval,
+    Vidore3PhysicsRetrieval,
+    Vidore3TelecomRetrieval,
+)
 from .web_faq_retrieval import WebFAQRetrieval
 from .wikipedia_retrieval_multilingual import WikipediaRetrievalMultilingual
 from .wit_t2i_retrieval import WITT2IRetrieval
```
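With the ViDoRe v3 tasks exported, they can be selected like any other task. A hypothetical usage sketch, following mteb 2.x's get_tasks/get_model/evaluate flow (the task name comes from the import list above; the model name is illustrative and would need to be a vision-retrieval model registered in mteb):

```python
import mteb

# Task name taken from the new vidore3 exports; model name is illustrative.
tasks = mteb.get_tasks(tasks=["Vidore3EnergyRetrieval"])
model = mteb.get_model("vidore/colqwen2-v1.0")
results = mteb.evaluate(model, tasks=tasks)
```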
```diff
@@ -161,6 +173,16 @@ __all__ = [
     "Vidore2ESGReportsHLRetrieval",
     "Vidore2ESGReportsRetrieval",
     "Vidore2EconomicsReportsRetrieval",
+    "Vidore3ComputerScienceRetrieval",
+    "Vidore3EnergyRetrieval",
+    "Vidore3FinanceEnRetrieval",
+    "Vidore3FinanceFrRetrieval",
+    "Vidore3HrRetrieval",
+    "Vidore3IndustrialRetrieval",
+    "Vidore3NuclearRetrieval",
+    "Vidore3PharmaceuticalsRetrieval",
+    "Vidore3PhysicsRetrieval",
+    "Vidore3TelecomRetrieval",
     "WITT2IRetrieval",
     "WebFAQRetrieval",
     "WikipediaRetrievalMultilingual",
```
```diff
@@ -132,7 +132,7 @@ _LANGUAGES = [
 
 
 def get_lang_pairs() -> dict[str, list[str]]:
-    # add pairs with same
+    # add pairs with same language as the source and target
     # add pairs with english as source or target
     lang_pairs = {}
    for x in _LANGUAGES:
```
```diff
@@ -230,10 +230,11 @@ class BelebeleRetrieval(AbsTaskRetrieval):
         ds_corpus = self.dataset[lang_corpus]
         ds_question = self.dataset[lang_question]
 
-        question_ids = {
-
-
-
+        question_ids = {}
+        for row in ds_question:
+            question = row["question"]
+            if question not in question_ids:
+                question_ids[question] = len(question_ids)
 
         link_to_context_id = {}
         context_idx = 0
```
```diff
@@ -72,7 +72,6 @@ def _load_single_language(
         lambda x: {
             "id": f"query-{split}-{x['query-id']}",
             "text": x["query"],
-            "image": None,
             "modality": "text",
         },
         remove_columns=["query-id", "query"],
```
```diff
@@ -87,7 +86,6 @@ def _load_single_language(
     corpus_ds = corpus_ds.map(
         lambda x: {
             "id": f"corpus-{split}-{x['corpus-id']}",
-            "text": None,
             "modality": "image",
         },
         remove_columns=["corpus-id"],
```
```diff
@@ -92,7 +92,7 @@ class MIRACLRetrievalHardNegativesV2(AbsTaskRetrieval):
             "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval "
             "dataset that focuses on search across 18 different languages. The hard negative version has been "
             "created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
-            "V2 uses a more appropriate prompt rather than the default prompt for retrieval."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
         ),
         dataset={
             "path": "mteb/MIRACLRetrievalHardNegatives",
```
```diff
@@ -30,7 +30,7 @@ _LANGUAGES = {
 def _load_miracl_data(
     path: str,
     langs: list,
-    splits: str,
+    splits: list[str],
     revision: str | None = None,
 ):
     corpus = {lang: dict.fromkeys(splits) for lang in langs}
```
```diff
@@ -65,9 +65,7 @@ def _load_miracl_data(
     images_data = images_data.map(
         lambda x: {
             "id": imgid2docid[str(x["file_name"])],
-            # "modality": "text",
             "modality": "image",
-            "text": None,
         },
         remove_columns=["file_name"],
     )
```
```diff
@@ -86,7 +84,6 @@ def _load_miracl_data(
             "id": str(x["_id"]),
             "text": x["text"],
             "modality": "text",
-            "image": None,
         },
         remove_columns=["_id"],
     )
```
```diff
@@ -108,10 +105,6 @@ def _load_miracl_data(
                 relevant_docs[lang][split][query_id] = {}
             relevant_docs[lang][split][query_id][doc_id] = score
 
-    corpus = datasets.DatasetDict(corpus)
-    queries = datasets.DatasetDict(queries)
-    relevant_docs = datasets.DatasetDict(relevant_docs)
-
     return corpus, queries, relevant_docs
 
 
```
```diff
@@ -156,7 +149,7 @@ class MIRACLVisionRetrieval(AbsTaskRetrieval):
 
         self.corpus, self.queries, self.relevant_docs = _load_miracl_data(
             path=self.metadata.dataset["path"],
-            splits=self.metadata.eval_splits
+            splits=self.metadata.eval_splits,
             langs=self.hf_subsets,
             revision=self.metadata.dataset["revision"],
         )
```
```diff
@@ -34,8 +34,7 @@ _EVAL_LANGS = {
 class MKQARetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="MKQARetrieval",
-        description="
-        For each query we collect new passage-independent answers. These queries and answers are then human translated into 25 Non-English languages.""",
+        description="Multilingual Knowledge Questions & Answers (MKQA)contains 10,000 queries sampled from the Google Natural Questions dataset. For each query we collect new passage-independent answers. These queries and answers are then human translated into 25 Non-English languages.",
         reference="https://github.com/apple/ml-mkqa",
         dataset={
             "path": "mteb/MKQARetrieval",
```
```diff
@@ -75,10 +75,7 @@ _EVAL_LANGS = extend_lang_pairs()
 class MLQARetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="MLQARetrieval",
-        description="
-        MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic,
-        German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between
-        4 different languages on average.""",
+        description="MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance. MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between 4 different languages on average.",
         reference="https://huggingface.co/datasets/mlqa",
         dataset={
             "path": "mteb/MLQARetrieval",
```
```diff
@@ -21,8 +21,7 @@ _LANGUAGES = {
 class MultiLongDocRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="MultiLongDocRetrieval",
-        description="
-        It is constructed by sampling lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. The generated question and the sampled article constitute a new text pair to the dataset.""",
+        description="Multi Long Doc Retrieval (MLDR) 'is curated by the multilingual articles from Wikipedia, Wudao and mC4 (see Table 7), and NarrativeQA (Kocˇisky ́ et al., 2018; Gu ̈nther et al., 2023), which is only for English.' (Chen et al., 2024). It is constructed by sampling lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. The generated question and the sampled article constitute a new text pair to the dataset.",
         reference="https://arxiv.org/abs/2402.03216",  # also: https://huggingface.co/datasets/Shitao/MLDR
         dataset={
             "path": "mteb/MultiLongDocRetrieval",
```
```diff
@@ -32,10 +32,15 @@ def _load_publichealthqa_data(
             split=split,
             revision=revision,
         )
-
-
-        }
-
+
+        question_ids = {}
+        answer_ids = {}
+
+        for row in data:
+            if row["question"] is not None and row["question"] not in question_ids:
+                question_ids[row["question"]] = len(question_ids)
+            if row["answer"] is not None and row["answer"] not in answer_ids:
+                answer_ids[row["answer"]] = len(answer_ids)
 
         for row in data:
             if row["question"] is None or row["answer"] is None:
```
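This is the same first-occurrence id pattern as in the GeorgianFAQRetrieval hunk, extended with a `None` guard: ids are only assigned to non-missing texts, and the subsequent loop (visible in the trailing context) drops rows where either side is missing. A small sketch with invented rows:

```python
rows = [
    {"question": "What is COVID-19?", "answer": "A coronavirus disease."},
    {"question": None, "answer": "Orphan answer."},
]

question_ids: dict[str, int] = {}
answer_ids: dict[str, int] = {}
for row in rows:
    if row["question"] is not None and row["question"] not in question_ids:
        question_ids[row["question"]] = len(question_ids)
    if row["answer"] is not None and row["answer"] not in answer_ids:
        answer_ids[row["answer"]] = len(answer_ids)

assert question_ids == {"What is COVID-19?": 0}
# A non-missing answer still gets an id even when its question is None;
# the later filtering loop simply never references such rows.
assert answer_ids == {"A coronavirus disease.": 0, "Orphan answer.": 1}
```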
```diff
@@ -68,11 +68,7 @@ class RuSciBenchCiteRetrieval(AbsTaskRetrieval):
             "path": "mlsa-iai-msu-lab/ru_sci_bench_cite_retrieval",
             "revision": "6cb447d02f41b8b775d5d9df7faf472f44d2f1db",
         },
-        description="
-        Russia's largest electronic library of scientific publications. Given a query paper (title and abstract),
-        the goal is to retrieve papers that are directly cited by it from a larger corpus of papers.
-        The dataset for this task consists of 3,000 query papers, 15,000 relevant (cited) papers,
-        and 75,000 irrelevant papers. The task is available for both Russian and English scientific texts.""",
+        description="This task is focused on Direct Citation Prediction for scientific papers from eLibrary, Russia's largest electronic library of scientific publications. Given a query paper (title and abstract), the goal is to retrieve papers that are directly cited by it from a larger corpus of papers. The dataset for this task consists of 3,000 query papers, 15,000 relevant (cited) papers, and 75,000 irrelevant papers. The task is available for both Russian and English scientific texts.",
         reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
         type="Retrieval",
         category="t2t",
```
|
|
|
130
126
|
"path": "mlsa-iai-msu-lab/ru_sci_bench_cocite_retrieval",
|
|
131
127
|
"revision": "a5da47a245275669d2b6ddf8f96c5338dd2428b4",
|
|
132
128
|
},
|
|
133
|
-
description="
|
|
134
|
-
Russia's largest electronic library of scientific publications. Given a query paper (title and abstract),
|
|
135
|
-
the goal is to retrieve other papers that are co-cited with it. Two papers are considered co-cited
|
|
136
|
-
if they are both cited by at least 5 of the same other papers. Similar to the Direct Citation task,
|
|
137
|
-
this task employs a retrieval setup: for a given query paper, all other papers in the corpus that
|
|
138
|
-
are not co-cited with it are considered negative examples. The task is available for both Russian
|
|
139
|
-
and English scientific texts.""",
|
|
129
|
+
description="This task focuses on Co-citation Prediction for scientific papers from eLibrary, Russia's largest electronic library of scientific publications. Given a query paper (title and abstract), the goal is to retrieve other papers that are co-cited with it. Two papers are considered co-cited if they are both cited by at least 5 of the same other papers. Similar to the Direct Citation task, this task employs a retrieval setup: for a given query paper, all other papers in the corpus that are not co-cited with it are considered negative examples. The task is available for both Russian and English scientific texts.",
|
|
140
130
|
reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
141
131
|
type="Retrieval",
|
|
142
132
|
category="t2t",
|
|
```diff
@@ -37,7 +37,6 @@ def _load_data(
         lambda x: {
             "id": f"query-{split}-{x['query-id']}",
             "text": x["query"],
-            "image": None,
             "modality": "text",
         },
         remove_columns=["query-id", "query"],
```
```diff
@@ -52,7 +51,6 @@ def _load_data(
     corpus_ds = corpus_ds.map(
         lambda x: {
             "id": f"corpus-{split}-{x['corpus-id']}",
-            "text": None,
             "modality": "image",
         },
         remove_columns=["corpus-id"],
```