PyPI - mteb - Versions diffs - 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl - Mend

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (412) hide show

mteb/benchmarks/benchmarks/benchmarks.py CHANGED Viewed

@@ -1,4 +1,9 @@
-from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark, MIEBBenchmark
+from mteb.benchmarks.benchmark import (
+    Benchmark,
+    HUMEBenchmark,
+    MIEBBenchmark,
+    VidoreBenchmark,
+)
 from mteb.get_tasks import MTEBTasks, get_task, get_tasks
 MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,
@@ -641,7 +646,7 @@ MTEB_KOR = Benchmark(
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
         languages=["kor"],
-        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is.
+        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
             # Classification
             "KLUE-TC",
             # Reranking
@@ -975,8 +980,6 @@ MTEB_INDIC = Benchmark(
                 # Bitext
                 "IN22ConvBitextMining",
                 "IN22GenBitextMining",
-                "IndicGenBenchFloresBitextMining",
-                "LinceMTBitextMining",
                 # clustering
                 "SIB200ClusteringS2S",
                 # classification
@@ -985,7 +988,6 @@ MTEB_INDIC = Benchmark(
                 "HindiDiscourseClassification",
                 "SentimentAnalysisHindi",
                 "MalayalamNewsClassification",
-                "IndicLangClassification",
                 "MTOPIntentClassification",
                 "MultiHateClassification",
                 "TweetSentimentClassification",
@@ -1008,7 +1010,7 @@ MTEB_INDIC = Benchmark(
         # STS
         (get_task("IndicCrosslingualSTS"),)
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1018,7 @@ MTEB_INDIC = Benchmark(
 eu_languages = [
-    # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?)
+    # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
     # germanic
     "dan",
     "eng",
@@ -1084,7 +1086,6 @@ MTEB_EU = Benchmark(
             "AmazonCounterfactualClassification",
             "MassiveScenarioClassification",
             "MultiHateClassification",
-            "NordicLangClassification",
             "ScalaClassification",
             "SwissJudgementClassification",
             "TweetSentimentClassification",
@@ -1142,7 +1143,7 @@ MTEB_EU = Benchmark(
         languages=eu_languages,
         exclusive_language_filter=True,
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1636,6 +1637,81 @@ BEIR_NL = Benchmark(
 """,
 )
+MTEB_NL = Benchmark(
+    name="MTEB(nld, v1)",
+    display_name="Dutch",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
+    tasks=MTEBTasks(
+        get_tasks(
+            languages=["nld"],
+            exclusive_language_filter=True,
+            tasks=[
+                # Classification
+                "DutchBookReviewSentimentClassification.v2",
+                "MassiveIntentClassification",
+                "MassiveScenarioClassification",
+                "SIB200Classification",
+                "MultiHateClassification",
+                "VaccinChatNLClassification",
+                "DutchColaClassification",
+                "DutchGovernmentBiasClassification",
+                "DutchSarcasticHeadlinesClassification",
+                "DutchNewsArticlesClassification",
+                "OpenTenderClassification",
+                "IconclassClassification",
+                # # PairClassification
+                "SICKNLPairClassification",
+                "XLWICNLPairClassification",
+                # # MultiLabelClassification
+                "CovidDisinformationNLMultiLabelClassification",
+                "MultiEURLEXMultilabelClassification",
+                "VABBMultiLabelClassification",
+                # # Clustering
+                "DutchNewsArticlesClusteringS2S",
+                "DutchNewsArticlesClusteringP2P",
+                "SIB200ClusteringS2S",
+                "VABBClusteringS2S",
+                "VABBClusteringP2P",
+                "OpenTenderClusteringS2S",
+                "OpenTenderClusteringP2P",
+                "IconclassClusteringS2S",
+                # # Reranking
+                "WikipediaRerankingMultilingual",
+                # # Retrieval
+                "ArguAna-NL.v2",
+                "SCIDOCS-NL.v2",
+                "SciFact-NL.v2",
+                "NFCorpus-NL.v2",
+                "BelebeleRetrieval",
+                "WebFAQRetrieval",
+                "DutchNewsArticlesRetrieval",
+                "bBSARDNLRetrieval",
+                "LegalQANLRetrieval",
+                "OpenTenderRetrieval",
+                "VABBRetrieval",
+                "WikipediaRetrievalMultilingual",
+                # # STS
+                "SICK-NL-STS",
+                "STSBenchmarkMultilingualSTS",
+            ],
+        )
+    ),
+    description="MTEB-NL",
+    reference="https://arxiv.org/abs/2509.12340",
+    contacts=["nikolay-banar"],
+    citation=r"""
+@misc{banar2025mtebnle5nlembeddingbenchmark,
+  archiveprefix = {arXiv},
+  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
+  eprint = {2509.12340},
+  primaryclass = {cs.CL},
+  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
+  url = {https://arxiv.org/abs/2509.12340},
+  year = {2025},
+}
+""",
+)
 MIEB_common_tasks = [
     # Image Classification
     "Birdsnap",  # fine
@@ -1783,7 +1859,7 @@ MIEB_ENG = MIEBBenchmark(
     ),
     description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks.""",
+    document understanding, visual STS, and CV-centric tasks.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
     citation=r"""
@@ -1817,7 +1893,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
     ),
     description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
+    document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
     datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2038,7 +2114,7 @@ BUILT_MTEB = Benchmark(
             "BuiltBenchReranking",
         ],
     ),
-    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.',
+    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
     reference="https://arxiv.org/abs/2411.12056",
     citation=r"""
 @article{shahinmoghadam2024benchmarking,
@@ -2143,10 +2219,43 @@ VIDORE_V2 = Benchmark(
 """,
 )
-VISUAL_DOCUMENT_RETRIEVAL = Benchmark(
-    name="VisualDocumentRetrieval",
-    display_name="Visual Document Retrieval",
-    icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
+VIDORE_V3 = VidoreBenchmark(
+    name="ViDoRe(v3)",
+    display_name="ViDoRe V3",
+    icon="https://cdn-uploads.huggingface.co/production/uploads/66e16a677c2eb2da5109fb5c/x99xqw__fl2UaPbiIdC_f.png",
+    tasks=get_tasks(
+        tasks=[
+            "Vidore3FinanceEnRetrieval",
+            "Vidore3IndustrialRetrieval",
+            "Vidore3ComputerScienceRetrieval",
+            "Vidore3PharmaceuticalsRetrieval",
+            "Vidore3HrRetrieval",
+            "Vidore3FinanceFrRetrieval",
+            "Vidore3PhysicsRetrieval",
+            "Vidore3EnergyRetrieval",
+            "Vidore3TelecomRetrieval",
+            "Vidore3NuclearRetrieval",
+        ]
+    ),
+    description="ViDoRe V3 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents. The benchmark includes both open and closed datasets: to submit results on private tasks, please [open an issue](https://github.com/embeddings-benchmark/mteb/issues?template=eval_request.yaml).",
+    reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
+    citation=r"""
+@misc{mace2025vidorev3,
+  author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
+  day = {5},
+  howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
+  journal = {Hugging Face Blog},
+  month = {November},
+  publisher = {Hugging Face},
+  title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
+  year = {2025},
+}
+""",
+)
+VISUAL_DOCUMENT_RETRIEVAL = VidoreBenchmark(
+    name="ViDoRe(v1&v2)",
+    display_name="ViDoRe (V1&V2)",
     tasks=get_tasks(
         tasks=[
             # v1

mteb/benchmarks/get_benchmark.py CHANGED Viewed

@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
     benchmark_registry = {
         inst.name: inst
-        for nam, inst in benchmark_module.__dict__.items()
+        for _, inst in benchmark_module.__dict__.items()
         if isinstance(inst, Benchmark)
     }
     return benchmark_registry
@@ -39,6 +39,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
         MTEB_RETRIEVAL_MEDICAL,
         MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
         SEB,
+        VISUAL_DOCUMENT_RETRIEVAL,
         MTEB_code,
         MTEB_multilingual_v2,
     )
@@ -63,6 +64,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
         "MTEB(Chinese)": C_MTEB.name,
         "FaMTEB(fas, beta)": FA_MTEB.name,
         "BRIGHT(long)": BRIGHT_LONG.name,
+        "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
     }
     return previous_benchmark_names

mteb/cache.py CHANGED Viewed

@@ -62,7 +62,11 @@ class ResultCache:
         Returns:
             The path to the results of the task.
         """
-        results_folder = "results" if not remote else "remote"
+        results_folder = (
+            self.cache_path / "results"
+            if not remote
+            else self.cache_path / "remote" / "results"
+        )
         if isinstance(model_name, ModelMeta):
             if model_revision is not None:
@@ -74,7 +78,7 @@ class ResultCache:
         elif isinstance(model_name, str):
             model_name = model_name.replace("/", "__").replace(" ", "_")
-        model_path = self.cache_path / results_folder / model_name
+        model_path = results_folder / model_name
         if model_revision is None:
             logger.warning(
@@ -495,7 +499,7 @@ class ResultCache:
             if validate_and_filter:
                 task = task_names[task_result.task_name]
                 try:
-                    task_result.validate_and_filter_scores(task=task)
+                    task_result = task_result.validate_and_filter_scores(task=task)
                 except Exception as e:
                     logger.info(
                         f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"

mteb/descriptive_stats/Classification/DutchColaClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 2400,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 92146,
+            "min_text_length": 5,
+            "average_text_length": 38.39416666666666,
+            "max_text_length": 138,
+            "unique_texts": 2400
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 1200
+                },
+                "0": {
+                    "count": 1200
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 19893,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 761416,
+            "min_text_length": 4,
+            "average_text_length": 38.27557432262605,
+            "max_text_length": 152,
+            "unique_texts": 19893
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 12604
+                },
+                "0": {
+                    "count": 7289
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 752,
+        "number_texts_intersect_with_train": 100,
+        "text_statistics": {
+            "total_text_length": 171956,
+            "min_text_length": 32,
+            "average_text_length": 228.66489361702128,
+            "max_text_length": 2746,
+            "unique_texts": 752
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "0.0": {
+                    "count": 555
+                },
+                "1.0": {
+                    "count": 197
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 1718,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 390362,
+            "min_text_length": 18,
+            "average_text_length": 227.2188591385332,
+            "max_text_length": 2662,
+            "unique_texts": 1718
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1.0": {
+                    "count": 470
+                },
+                "0.0": {
+                    "count": 1248
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json ADDED Viewed

@@ -0,0 +1,90 @@
+{
+    "test": {
+        "num_samples": 1200,
+        "number_texts_intersect_with_train": 1,
+        "text_statistics": {
+            "total_text_length": 2034506,
+            "min_text_length": 184,
+            "average_text_length": 1695.4216666666666,
+            "max_text_length": 8825,
+            "unique_texts": 1200
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 8,
+            "labels": {
+                "Opmerkelijk": {
+                    "count": 150
+                },
+                "Buitenland": {
+                    "count": 150
+                },
+                "Cultuur & Media": {
+                    "count": 150
+                },
+                "Binnenland": {
+                    "count": 150
+                },
+                "Politiek": {
+                    "count": 150
+                },
+                "Economie": {
+                    "count": 150
+                },
+                "Tech": {
+                    "count": 150
+                },
+                "Regionaal nieuws": {
+                    "count": 150
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 5600,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 9620538,
+            "min_text_length": 106,
+            "average_text_length": 1717.9532142857142,
+            "max_text_length": 29389,
+            "unique_texts": 5600
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 8,
+            "labels": {
+                "Cultuur & Media": {
+                    "count": 700
+                },
+                "Binnenland": {
+                    "count": 700
+                },
+                "Buitenland": {
+                    "count": 700
+                },
+                "Regionaal nieuws": {
+                    "count": 700
+                },
+                "Politiek": {
+                    "count": 700
+                },
+                "Economie": {
+                    "count": 700
+                },
+                "Opmerkelijk": {
+                    "count": 700
+                },
+                "Tech": {
+                    "count": 700
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 1326,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 82644,
+            "min_text_length": 17,
+            "average_text_length": 62.32579185520362,
+            "max_text_length": 117,
+            "unique_texts": 1326
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "0": {
+                    "count": 826
+                },
+                "1": {
+                    "count": 500
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 10609,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 658787,
+            "min_text_length": 7,
+            "average_text_length": 62.09699311904986,
+            "max_text_length": 161,
+            "unique_texts": 10609
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 4000
+                },
+                "0": {
+                    "count": 6609
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/IconclassClassification.json ADDED Viewed

@@ -0,0 +1,96 @@
+{
+    "test": {
+        "num_samples": 202,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 11827,
+            "min_text_length": 6,
+            "average_text_length": 58.54950495049505,
+            "max_text_length": 403,
+            "unique_texts": 202
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 9,
+            "labels": {
+                "Geschiedenis": {
+                    "count": 22
+                },
+                "Klassieke mythologie en Oude Geschiedenis": {
+                    "count": 22
+                },
+                "Literatuur": {
+                    "count": 23
+                },
+                "Natuur": {
+                    "count": 23
+                },
+                "De mens, de mensheid in het algemeen": {
+                    "count": 22
+                },
+                "Maatschappij, civilisatie en cultuur": {
+                    "count": 22
+                },
+                "Abstracte idee\u00ebn en concepten": {
+                    "count": 23
+                },
+                "Religie en magie": {
+                    "count": 22
+                },
+                "Bijbel": {
+                    "count": 23
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 945,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 52510,
+            "min_text_length": 3,
+            "average_text_length": 55.56613756613756,
+            "max_text_length": 793,
+            "unique_texts": 945
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 9,
+            "labels": {
+                "Literatuur": {
+                    "count": 105
+                },
+                "Maatschappij, civilisatie en cultuur": {
+                    "count": 105
+                },
+                "Klassieke mythologie en Oude Geschiedenis": {
+                    "count": 105
+                },
+                "Bijbel": {
+                    "count": 105
+                },
+                "De mens, de mensheid in het algemeen": {
+                    "count": 105
+                },
+                "Abstracte idee\u00ebn en concepten": {
+                    "count": 105
+                },
+                "Natuur": {
+                    "count": 105
+                },
+                "Geschiedenis": {
+                    "count": 105
+                },
+                "Religie en magie": {
+                    "count": 105
+                }
+            }
+        }
+    }
+}

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl