PyPI - mteb - Versions diffs - 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl - Mend

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (156) hide show

mteb/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from importlib.metadata import version
+from mteb import types
 from mteb.abstasks import AbsTask
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.deprecated_evaluator import MTEB
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
 from mteb.filter_tasks import filter_tasks
 from mteb.get_tasks import get_task, get_tasks
 from mteb.load_results import load_results
-from mteb.models import EncoderProtocol, SentenceTransformerEncoderWrapper
+from mteb.models import (
+    CrossEncoderProtocol,
+    EncoderProtocol,
+    SearchProtocol,
+    SentenceTransformerEncoderWrapper,
+)
 from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
 from mteb.results import BenchmarkResults, TaskResult
@@ -21,7 +27,9 @@ __all__ = [
     "AbsTask",
     "Benchmark",
     "BenchmarkResults",
+    "CrossEncoderProtocol",
     "EncoderProtocol",
+    "SearchProtocol",
     "SentenceTransformerEncoderWrapper",
     "TaskMetadata",
     "TaskResult",
@@ -35,4 +43,5 @@ __all__ = [
     "get_task",
     "get_tasks",
     "load_results",
+    "types",
 ]

mteb/_create_dataloaders.py CHANGED Viewed

@@ -277,6 +277,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
             # Leave the images as a list to avoid stacking errors.
             collated[key] = [item[key] for item in batch]
         else:
+            if any(item[key] is None for item in batch):
+                raise ValueError(f"Found None in batch for key '{key}'")
             collated[key] = default_collate([item[key] for item in batch])
     return collated

mteb/abstasks/_stratification.py CHANGED Viewed

@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
 class IterativeStratification(_BaseKFold):
     """Iteratively stratify a multi-label data set into folds
-    Construct an interative stratifier that splits the data set into folds trying to maintain balanced representation
+    Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
     with respect to order-th label combinations.
     """

mteb/abstasks/abstask.py CHANGED Viewed

@@ -459,7 +459,7 @@ class AbsTask(ABC):
         """Filter the languages of the task.
         Args:
-            languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script
+            languages: list of languages to filter the task by can be either a 3-letter language code (e.g. "eng") or also include the script
                 (e.g. "eng-Latn")
             script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
                 If the language code does not specify the script the intersection of the language and script will be used.
@@ -491,6 +491,11 @@ class AbsTask(ABC):
                 if lang_scripts.contains_languages(langs):
                     subsets_to_keep.append(hf_subset)
+        if len(subsets_to_keep) == 0:
+            raise ValueError(
+                f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
+            )
         self.hf_subsets = subsets_to_keep
         return self

mteb/abstasks/dataset_card_template.md CHANGED Viewed

@@ -85,7 +85,7 @@ desc_stats = task.metadata.descriptive_stats
 ```
 ```json
-{{ descritptive_stats | default("{}", true) }}
+{{ descriptive_stats | default("{}", true) }}
 ```
 </details>

mteb/abstasks/retrieval.py CHANGED Viewed

@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
             FileNotFoundError: If the specified path does not exist.
             ValueError: If the loaded top ranked results are not in the expected format.
         """
+        self._top_k = top_k
         top_ranked_path = Path(top_ranked_path)
         if top_ranked_path.is_dir():
             top_ranked_path = self._predictions_path(top_ranked_path)
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
                     top_k_sorted[query_id] = sorted_keys[: self._top_k]
                 self.dataset[subset][split]["top_ranked"] = top_k_sorted
-        self._top_k = top_k
         return self

mteb/abstasks/retrieval_dataset_loaders.py CHANGED Viewed

@@ -176,7 +176,7 @@ class RetrievalDatasetLoader:
                 {
                     "query-id": Value("string"),
                     "corpus-id": Value("string"),
-                    "score": Value("uint16"),
+                    "score": Value("int32"),
                 }
             )
         )

mteb/abstasks/task_metadata.py CHANGED Viewed

@@ -532,7 +532,7 @@ class TaskMetadata(BaseModel):
                 citation=self.bibtex_citation,
                 dataset_description=self.description,
                 dataset_reference=self.reference,
-                descritptive_stats=descriptive_stats,
+                descriptive_stats=descriptive_stats,
                 dataset_task_name=self.name,
                 category=self.category,
                 domains=", ".join(self.domains) if self.domains else None,

mteb/benchmarks/benchmarks/__init__.py CHANGED Viewed

@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     MTEB_KOR,
     MTEB_MAIN_RU,
     MTEB_MINERS_BITEXT_MINING,
+    MTEB_NL,
     MTEB_POL,
     MTEB_RETRIEVAL_LAW,
     MTEB_RETRIEVAL_MEDICAL,
@@ -87,6 +88,7 @@ __all__ = [
     "MTEB_KOR",
     "MTEB_MAIN_RU",
     "MTEB_MINERS_BITEXT_MINING",
+    "MTEB_NL",
     "MTEB_POL",
     "MTEB_RETRIEVAL_LAW",
     "MTEB_RETRIEVAL_MEDICAL",

mteb/benchmarks/benchmarks/benchmarks.py CHANGED Viewed

@@ -641,7 +641,7 @@ MTEB_KOR = Benchmark(
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
         languages=["kor"],
-        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is.
+        tasks=[  # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
             # Classification
             "KLUE-TC",
             # Reranking
@@ -975,8 +975,6 @@ MTEB_INDIC = Benchmark(
                 # Bitext
                 "IN22ConvBitextMining",
                 "IN22GenBitextMining",
-                "IndicGenBenchFloresBitextMining",
-                "LinceMTBitextMining",
                 # clustering
                 "SIB200ClusteringS2S",
                 # classification
@@ -985,7 +983,6 @@ MTEB_INDIC = Benchmark(
                 "HindiDiscourseClassification",
                 "SentimentAnalysisHindi",
                 "MalayalamNewsClassification",
-                "IndicLangClassification",
                 "MTOPIntentClassification",
                 "MultiHateClassification",
                 "TweetSentimentClassification",
@@ -1008,7 +1005,7 @@ MTEB_INDIC = Benchmark(
         # STS
         (get_task("IndicCrosslingualSTS"),)
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1013,7 @@ MTEB_INDIC = Benchmark(
 eu_languages = [
-    # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?)
+    # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
     # germanic
     "dan",
     "eng",
@@ -1084,7 +1081,6 @@ MTEB_EU = Benchmark(
             "AmazonCounterfactualClassification",
             "MassiveScenarioClassification",
             "MultiHateClassification",
-            "NordicLangClassification",
             "ScalaClassification",
             "SwissJudgementClassification",
             "TweetSentimentClassification",
@@ -1142,7 +1138,7 @@ MTEB_EU = Benchmark(
         languages=eu_languages,
         exclusive_language_filter=True,
     ),
-    description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1636,6 +1632,81 @@ BEIR_NL = Benchmark(
 """,
 )
+MTEB_NL = Benchmark(
+    name="MTEB(nld, v1)",
+    display_name="MTEB-NL",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
+    tasks=MTEBTasks(
+        get_tasks(
+            languages=["nld"],
+            exclusive_language_filter=True,
+            tasks=[
+                # Classification
+                "DutchBookReviewSentimentClassification",
+                "MassiveIntentClassification",
+                "MassiveScenarioClassification",
+                "SIB200Classification",
+                "MultiHateClassification",
+                "VaccinChatNLClassification",
+                "DutchColaClassification",
+                "DutchGovernmentBiasClassification",
+                "DutchSarcasticHeadlinesClassification",
+                "DutchNewsArticlesClassification",
+                "OpenTenderClassification",
+                "IconclassClassification",
+                # # PairClassification
+                "SICKNLPairClassification",
+                "XLWICNLPairClassification",
+                # # MultiLabelClassification
+                "CovidDisinformationNLMultiLabelClassification",
+                "MultiEURLEXMultilabelClassification",
+                "VABBMultiLabelClassification",
+                # # Clustering
+                "DutchNewsArticlesClusteringS2S",
+                "DutchNewsArticlesClusteringP2P",
+                "SIB200ClusteringS2S",
+                "VABBClusteringS2S",
+                "VABBClusteringP2P",
+                "OpenTenderClusteringS2S",
+                "OpenTenderClusteringP2P",
+                "IconclassClusteringS2S",
+                # # Reranking
+                "WikipediaRerankingMultilingual",
+                # # Retrieval
+                "ArguAna-NL",
+                "SCIDOCS-NL",
+                "SciFact-NL",
+                "NFCorpus-NL",
+                "BelebeleRetrieval",
+                # "WebFAQRetrieval",
+                "DutchNewsArticlesRetrieval",
+                "bBSARDNLRetrieval",
+                "LegalQANLRetrieval",
+                "OpenTenderRetrieval",
+                "VABBRetrieval",
+                "WikipediaRetrievalMultilingual",
+                # # STS
+                "SICK-NL-STS",
+                "STSBenchmarkMultilingualSTS",
+            ],
+        )
+    ),
+    description="MTEB-NL",
+    reference="https://arxiv.org/abs/2509.12340",
+    contacts=["nikolay-banar"],
+    citation=r"""
+@misc{banar2025mtebnle5nlembeddingbenchmark,
+  archiveprefix = {arXiv},
+  author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
+  eprint = {22509.12340},
+  primaryclass = {cs.CL},
+  title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
+  url = {https://arxiv.org/abs/2509.12340},
+  year = {2025},
+}
+""",
+)
 MIEB_common_tasks = [
     # Image Classification
     "Birdsnap",  # fine
@@ -1783,7 +1854,7 @@ MIEB_ENG = MIEBBenchmark(
     ),
     description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks.""",
+    document understanding, visual STS, and CV-centric tasks.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
     citation=r"""
@@ -1817,7 +1888,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
     ),
     description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
     In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-    document undestanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
+    document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
     datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2038,7 +2109,7 @@ BUILT_MTEB = Benchmark(
             "BuiltBenchReranking",
         ],
     ),
-    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.',
+    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
     reference="https://arxiv.org/abs/2411.12056",
     citation=r"""
 @article{shahinmoghadam2024benchmarking,

mteb/benchmarks/get_benchmark.py CHANGED Viewed

@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
     benchmark_registry = {
         inst.name: inst
-        for nam, inst in benchmark_module.__dict__.items()
+        for _, inst in benchmark_module.__dict__.items()
         if isinstance(inst, Benchmark)
     }
     return benchmark_registry

mteb/descriptive_stats/Classification/DutchColaClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 2400,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 92146,
+            "min_text_length": 5,
+            "average_text_length": 38.39416666666666,
+            "max_text_length": 138,
+            "unique_texts": 2400
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 1200
+                },
+                "0": {
+                    "count": 1200
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 19893,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 761416,
+            "min_text_length": 4,
+            "average_text_length": 38.27557432262605,
+            "max_text_length": 152,
+            "unique_texts": 19893
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 12604
+                },
+                "0": {
+                    "count": 7289
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 752,
+        "number_texts_intersect_with_train": 100,
+        "text_statistics": {
+            "total_text_length": 171956,
+            "min_text_length": 32,
+            "average_text_length": 228.66489361702128,
+            "max_text_length": 2746,
+            "unique_texts": 752
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "0.0": {
+                    "count": 555
+                },
+                "1.0": {
+                    "count": 197
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 1718,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 390362,
+            "min_text_length": 18,
+            "average_text_length": 227.2188591385332,
+            "max_text_length": 2662,
+            "unique_texts": 1718
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1.0": {
+                    "count": 470
+                },
+                "0.0": {
+                    "count": 1248
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json ADDED Viewed

@@ -0,0 +1,90 @@
+{
+    "test": {
+        "num_samples": 1200,
+        "number_texts_intersect_with_train": 1,
+        "text_statistics": {
+            "total_text_length": 2034506,
+            "min_text_length": 184,
+            "average_text_length": 1695.4216666666666,
+            "max_text_length": 8825,
+            "unique_texts": 1200
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 8,
+            "labels": {
+                "Opmerkelijk": {
+                    "count": 150
+                },
+                "Buitenland": {
+                    "count": 150
+                },
+                "Cultuur & Media": {
+                    "count": 150
+                },
+                "Binnenland": {
+                    "count": 150
+                },
+                "Politiek": {
+                    "count": 150
+                },
+                "Economie": {
+                    "count": 150
+                },
+                "Tech": {
+                    "count": 150
+                },
+                "Regionaal nieuws": {
+                    "count": 150
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 5600,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 9620538,
+            "min_text_length": 106,
+            "average_text_length": 1717.9532142857142,
+            "max_text_length": 29389,
+            "unique_texts": 5600
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 8,
+            "labels": {
+                "Cultuur & Media": {
+                    "count": 700
+                },
+                "Binnenland": {
+                    "count": 700
+                },
+                "Buitenland": {
+                    "count": 700
+                },
+                "Regionaal nieuws": {
+                    "count": 700
+                },
+                "Politiek": {
+                    "count": 700
+                },
+                "Economie": {
+                    "count": 700
+                },
+                "Opmerkelijk": {
+                    "count": 700
+                },
+                "Tech": {
+                    "count": 700
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+    "test": {
+        "num_samples": 1326,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 82644,
+            "min_text_length": 17,
+            "average_text_length": 62.32579185520362,
+            "max_text_length": 117,
+            "unique_texts": 1326
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "0": {
+                    "count": 826
+                },
+                "1": {
+                    "count": 500
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 10609,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 658787,
+            "min_text_length": 7,
+            "average_text_length": 62.09699311904986,
+            "max_text_length": 161,
+            "unique_texts": 10609
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 2,
+            "labels": {
+                "1": {
+                    "count": 4000
+                },
+                "0": {
+                    "count": 6609
+                }
+            }
+        }
+    }
+}

mteb/descriptive_stats/Classification/IconclassClassification.json ADDED Viewed

@@ -0,0 +1,96 @@
+{
+    "test": {
+        "num_samples": 202,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 11827,
+            "min_text_length": 6,
+            "average_text_length": 58.54950495049505,
+            "max_text_length": 403,
+            "unique_texts": 202
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 9,
+            "labels": {
+                "Geschiedenis": {
+                    "count": 22
+                },
+                "Klassieke mythologie en Oude Geschiedenis": {
+                    "count": 22
+                },
+                "Literatuur": {
+                    "count": 23
+                },
+                "Natuur": {
+                    "count": 23
+                },
+                "De mens, de mensheid in het algemeen": {
+                    "count": 22
+                },
+                "Maatschappij, civilisatie en cultuur": {
+                    "count": 22
+                },
+                "Abstracte idee\u00ebn en concepten": {
+                    "count": 23
+                },
+                "Religie en magie": {
+                    "count": 22
+                },
+                "Bijbel": {
+                    "count": 23
+                }
+            }
+        }
+    },
+    "train": {
+        "num_samples": 945,
+        "number_texts_intersect_with_train": null,
+        "text_statistics": {
+            "total_text_length": 52510,
+            "min_text_length": 3,
+            "average_text_length": 55.56613756613756,
+            "max_text_length": 793,
+            "unique_texts": 945
+        },
+        "image_statistics": null,
+        "label_statistics": {
+            "min_labels_per_text": 1,
+            "average_label_per_text": 1.0,
+            "max_labels_per_text": 1,
+            "unique_labels": 9,
+            "labels": {
+                "Literatuur": {
+                    "count": 105
+                },
+                "Maatschappij, civilisatie en cultuur": {
+                    "count": 105
+                },
+                "Klassieke mythologie en Oude Geschiedenis": {
+                    "count": 105
+                },
+                "Bijbel": {
+                    "count": 105
+                },
+                "De mens, de mensheid in het algemeen": {
+                    "count": 105
+                },
+                "Abstracte idee\u00ebn en concepten": {
+                    "count": 105
+                },
+                "Natuur": {
+                    "count": 105
+                },
+                "Geschiedenis": {
+                    "count": 105
+                },
+                "Religie en magie": {
+                    "count": 105
+                }
+            }
+        }
+    }
+}

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl