PyPI - mteb - Versions diffs - 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl - Mend

mteb 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py CHANGED Viewed

@@ -20,9 +20,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         corpus[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"corpus-{split}-{idx}",
-                "text": None,
                 "modality": "image",
-                # "image": None,
             },
             with_indices=True,
             remove_columns=[
@@ -37,9 +35,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
-                # "text": None,
                 "modality": "text",
-                "image": None,
             },
             with_indices=True,
             remove_columns=[

mteb/tasks/retrieval/eng/vidore_bench_retrieval.py CHANGED Viewed

@@ -24,7 +24,6 @@ def _load_data(
             lambda x: {
                 "id": f"query-{split}-{x['query-id']}",
                 "text": x["query"],
-                "image": None,
                 "modality": "text",
             },
             remove_columns=["query-id", "query"],
@@ -40,7 +39,6 @@ def _load_data(
         corpus_ds = corpus_ds.map(
             lambda x: {
                 "id": f"corpus-{split}-{x['corpus-id']}",
-                "text": None,
                 "modality": "image",
             },
             remove_columns=["corpus-id"],

mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py CHANGED Viewed

@@ -7,7 +7,7 @@ class JaGovFaqsRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="JaGovFaqsRetrieval",
-        description="JaGovFaqs is a dataset consisting of FAQs manully extracted from the website of Japanese bureaus. The dataset consists of 22k FAQs, where the queries (questions) and corpus (answers) have been shuffled, and the goal is to match the answer with the question.",
+        description="JaGovFaqs is a dataset consisting of FAQs manually extracted from the website of Japanese bureaus. The dataset consists of 22k FAQs, where the queries (questions) and corpus (answers) have been shuffled, and the goal is to match the answer with the question.",
         reference="https://github.com/sbintuitions/JMTEB",
         dataset={
             "path": "mteb/JaGovFaqsRetrieval",

mteb/tasks/retrieval/multilingual/belebele_retrieval.py CHANGED Viewed

@@ -132,7 +132,7 @@ _LANGUAGES = [
 def get_lang_pairs() -> dict[str, list[str]]:
-    # add pairs with same langauge as the source and target
+    # add pairs with same language as the source and target
     # add pairs with english as source or target
     lang_pairs = {}
     for x in _LANGUAGES:

mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py CHANGED Viewed

@@ -72,7 +72,6 @@ def _load_single_language(
         lambda x: {
             "id": f"query-{split}-{x['query-id']}",
             "text": x["query"],
-            "image": None,
             "modality": "text",
         },
         remove_columns=["query-id", "query"],
@@ -87,7 +86,6 @@ def _load_single_language(
     corpus_ds = corpus_ds.map(
         lambda x: {
             "id": f"corpus-{split}-{x['corpus-id']}",
-            "text": None,
             "modality": "image",
         },
         remove_columns=["corpus-id"],

mteb/tasks/retrieval/multilingual/miracl_retrieval.py CHANGED Viewed

@@ -92,7 +92,7 @@ class MIRACLRetrievalHardNegativesV2(AbsTaskRetrieval):
             "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval "
             "dataset that focuses on search across 18 different languages. The hard negative version has been "
             "created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
-            "V2 uses a more appropriate prompt rather than the default prompt for retrieval."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
         ),
         dataset={
             "path": "mteb/MIRACLRetrievalHardNegatives",

mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py CHANGED Viewed

@@ -30,7 +30,7 @@ _LANGUAGES = {
 def _load_miracl_data(
     path: str,
     langs: list,
-    splits: str,
+    splits: list[str],
     revision: str | None = None,
 ):
     corpus = {lang: dict.fromkeys(splits) for lang in langs}
@@ -65,9 +65,7 @@ def _load_miracl_data(
         images_data = images_data.map(
             lambda x: {
                 "id": imgid2docid[str(x["file_name"])],
-                # "modality": "text",
                 "modality": "image",
-                "text": None,
             },
             remove_columns=["file_name"],
         )
@@ -86,7 +84,6 @@ def _load_miracl_data(
                 "id": str(x["_id"]),
                 "text": x["text"],
                 "modality": "text",
-                "image": None,
             },
             remove_columns=["_id"],
         )
@@ -108,10 +105,6 @@ def _load_miracl_data(
                 relevant_docs[lang][split][query_id] = {}
             relevant_docs[lang][split][query_id][doc_id] = score
-    corpus = datasets.DatasetDict(corpus)
-    queries = datasets.DatasetDict(queries)
-    relevant_docs = datasets.DatasetDict(relevant_docs)
     return corpus, queries, relevant_docs
@@ -156,7 +149,7 @@ class MIRACLVisionRetrieval(AbsTaskRetrieval):
         self.corpus, self.queries, self.relevant_docs = _load_miracl_data(
             path=self.metadata.dataset["path"],
-            splits=self.metadata.eval_splits[0],
+            splits=self.metadata.eval_splits,
             langs=self.hf_subsets,
             revision=self.metadata.dataset["revision"],
         )

mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py CHANGED Viewed

@@ -37,7 +37,6 @@ def _load_data(
             lambda x: {
                 "id": f"query-{split}-{x['query-id']}",
                 "text": x["query"],
-                "image": None,
                 "modality": "text",
             },
             remove_columns=["query-id", "query"],
@@ -52,7 +51,6 @@ def _load_data(
         corpus_ds = corpus_ds.map(
             lambda x: {
                 "id": f"corpus-{split}-{x['corpus-id']}",
-                "text": None,
                 "modality": "image",
             },
             remove_columns=["corpus-id"],

mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py CHANGED Viewed

@@ -34,7 +34,6 @@ def _load_wit_data(path: str, langs: list, splits: str, revision: str | None = N
         lang_corpus = lang_data.map(
             lambda x: {
                 "id": "corpus-" + x["image_id"],
-                "text": None,
                 "modality": "image",
                 "image": x["image"],
             },
@@ -60,7 +59,6 @@ def _load_wit_data(path: str, langs: list, splits: str, revision: str | None = N
                         "id": query_id,
                         "text": caption,
                         "modality": "text",
-                        "image": None,
                     }
                 )
                 if query_id not in relevant_docs[lang][split]:

mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from datasets import DatasetDict, load_dataset
+from datasets import DatasetDict, Image, load_dataset
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
@@ -16,7 +16,7 @@ _LANGUAGES = {
 def _load_xflickrco_data(
-    path: str, langs: list, splits: str, revision: str | None = None
+    path: str, langs: list, splits: list[str], revision: str | None = None
 ):
     corpus = {lang: dict.fromkeys(splits) for lang in langs}
     queries = {lang: dict.fromkeys(splits) for lang in langs}
@@ -32,22 +32,23 @@ def _load_xflickrco_data(
         lang_corpus = lang_data.map(
             lambda x: {
                 "id": "corpus-" + x["id"],
-                "text": None,
                 "modality": "image",
-                "image": x["image"]["bytes"],
+                "image": x["image"],
             },
             remove_columns=["sentences"],
         )
+        lang_corpus = lang_corpus.cast_column("image", Image())
         lang_queries = lang_data.map(
             lambda x: {
                 "id": "query-" + x["id"],
                 "text": x["sentences"],
                 "modality": "text",
-                "image": None,
             },
             remove_columns=["sentences"],
         )
+        # None values
+        lang_queries = lang_queries.remove_columns(["image"])
         relevant_docs[lang][split] = {}
         for row in lang_data:

mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import Dataset, DatasetDict, Image, load_dataset
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
@@ -61,9 +61,8 @@ def _load_xm3600_data(
         lang_corpus = lang_data.map(
             lambda x: {
                 "id": "corpus-" + x["image_id"],
-                "text": None,
                 "modality": "image",
-                "image": x["image"]["bytes"],
+                "image": x["image"],
             },
             remove_columns=[
                 "captions",
@@ -73,6 +72,7 @@ def _load_xm3600_data(
                 "image_id",
             ],
         )
+        lang_corpus = lang_corpus.cast_column("image", Image())
         corpus[lang][split] = lang_corpus
@@ -90,7 +90,6 @@ def _load_xm3600_data(
                         "id": query_id,
                         "text": caption,
                         "modality": "text",
-                        "image": None,
                     }
                 )
                 if query_id not in relevant_docs[lang][split]:

mteb/tasks/retrieval/nob/norquad.py CHANGED Viewed

@@ -59,9 +59,9 @@ Fishel, Mark},
         self.data_loaded = True
     def dataset_transform(self) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

mteb/tasks/retrieval/nob/snl_retrieval.py CHANGED Viewed

@@ -46,9 +46,9 @@ class SNLRetrieval(AbsTaskRetrieval):
         self.data_loaded = True
     def dataset_transform(self) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

mteb/tasks/retrieval/rus/__init__.py CHANGED Viewed

@@ -1,4 +1,13 @@
-from .ria_news_retrieval import RiaNewsRetrieval, RiaNewsRetrievalHardNegatives
+from .ria_news_retrieval import (
+    RiaNewsRetrieval,
+    RiaNewsRetrievalHardNegatives,
+    RiaNewsRetrievalHardNegativesV2,
+)
 from .ru_bq_retrieval import RuBQRetrieval
-__all__ = ["RiaNewsRetrieval", "RiaNewsRetrievalHardNegatives", "RuBQRetrieval"]
+__all__ = [
+    "RiaNewsRetrieval",
+    "RiaNewsRetrievalHardNegatives",
+    "RiaNewsRetrievalHardNegativesV2",
+    "RuBQRetrieval",
+]

mteb/tasks/retrieval/rus/ria_news_retrieval.py CHANGED Viewed

@@ -1,6 +1,31 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
+_ria_news_metadata = dict(
+    reference="https://arxiv.org/abs/1901.07786",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["rus-Cyrl"],
+    main_score="ndcg_at_10",
+    date=("2010-01-01", "2014-12-31"),
+    domains=["News", "Written"],
+    task_subtypes=["Article retrieval"],
+    license="cc-by-nc-nd-4.0",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
+@inproceedings{gavrilov2018self,
+  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
+  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
+  title = {Self-Attentive Model for Headline Generation},
+  year = {2019},
+}
+""",
+)
 class RiaNewsRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
@@ -12,29 +37,8 @@ class RiaNewsRetrieval(AbsTaskRetrieval):
             "revision": "82374b0bbacda6114f39ff9c5b925fa1512ca5d7",
         },
         description="News article retrieval by headline. Based on Rossiya Segodnya dataset.",
-        reference="https://arxiv.org/abs/1901.07786",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["rus-Cyrl"],
-        main_score="ndcg_at_10",
-        date=("2010-01-01", "2014-12-31"),
-        domains=["News", "Written"],
-        task_subtypes=["Article retrieval"],
-        license="cc-by-nc-nd-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{gavrilov2018self,
-  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
-  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
-  title = {Self-Attentive Model for Headline Generation},
-  year = {2019},
-}
-""",
         prompt={"query": "Given a news title, retrieve relevant news article"},
+        **_ria_news_metadata,
     )
@@ -48,27 +52,27 @@ class RiaNewsRetrievalHardNegatives(AbsTaskRetrieval):
             "revision": "d42860a6c15f0a2c4485bda10c6e5b641fdfe479",
         },
         description="News article retrieval by headline. Based on Rossiya Segodnya dataset. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
-        reference="https://arxiv.org/abs/1901.07786",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["rus-Cyrl"],
-        main_score="ndcg_at_10",
-        date=("2010-01-01", "2014-12-31"),
-        domains=["News", "Written"],
-        task_subtypes=["Article retrieval"],
-        license="cc-by-nc-nd-4.0",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{gavrilov2018self,
-  author = {Gavrilov, Daniil and  Kalaidin, Pavel and  Malykh, Valentin},
-  booktitle = {Proceedings of the 41st European Conference on Information Retrieval},
-  title = {Self-Attentive Model for Headline Generation},
-  year = {2019},
-}
-""",
         adapted_from=["RiaNewsRetrieval"],
+        superseded_by="RiaNewsRetrievalHardNegatives.v2",
+        **_ria_news_metadata,
+    )
+class RiaNewsRetrievalHardNegativesV2(AbsTaskRetrieval):
+    ignore_identical_ids = True
+    metadata = TaskMetadata(
+        name="RiaNewsRetrievalHardNegatives.v2",
+        dataset={
+            "path": "mteb/RiaNewsRetrieval_test_top_250_only_w_correct-v2",
+            "revision": "d42860a6c15f0a2c4485bda10c6e5b641fdfe479",
+        },
+        description=(
+            "News article retrieval by headline. Based on Rossiya Segodnya dataset. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
+        ),
+        adapted_from=["RiaNewsRetrieval"],
+        prompt={"query": "Given a news title, retrieve relevant news article"},
+        **_ria_news_metadata,
     )

mteb/tasks/retrieval/tur/tur_hist_quad.py CHANGED Viewed

@@ -42,9 +42,9 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
     )
     def load_data(self, **kwargs) -> None:
-        """And transform to a retrieval datset, which have the following attributes
+        """And transform to a retrieval dataset, which have the following attributes
-        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = dict[doc_id, dict[str, str]] #id => dict with document data like title and text
         self.queries = dict[query_id, str] #id => query
         self.relevant_docs = dict[query_id, dict[[doc_id, score]]
         """

{mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.1.0
+Version: 2.1.1
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -204,10 +204,10 @@ For more on how to use the CLI check out the [related documentation](https://emb
 [Tasks]: https://embeddings-benchmark.github.io/mteb/overview/available_tasks/any2anymultilingualretrieval/
 [Benchmarks]: https://embeddings-benchmark.github.io/mteb/overview/available_benchmarks/
 [Models]: https://embeddings-benchmark.github.io/mteb/overview/available_models/text/
-[Contributing]: docs/CONTRIBUTING.md
-[Adding a model]: docs/contributing/adding_a_model.md
-[Adding a dataset]: docs/contributing/adding_a_dataset.md
-[Adding a benchmark]: docs/contributing/adding_a_benchmark.md
+[Contributing]: https://embeddings-benchmark.github.io/mteb/CONTRIBUTING/
+[Adding a model]: https://embeddings-benchmark.github.io/mteb/contributing/adding_a_model/
+[Adding a dataset]: https://embeddings-benchmark.github.io/mteb/contributing/adding_a_dataset/
+[Adding a benchmark]: https://embeddings-benchmark.github.io/mteb/contributing/adding_a_benchmark/
 [Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard
 ## Citing

mteb 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

mteb 2.1.0__py3-none-any.whl → 2.1.1__py3-none-any.whl