mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +2 -0
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +1 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +82 -11
- mteb/benchmarks/get_benchmark.py +1 -1
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +2 -2
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +9 -9
- mteb/results/task_result.py +6 -8
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
- mteb/tasks/classification/mya/myanmar_news.py +2 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
- mteb/tasks/classification/nld/iconclass_classification.py +41 -0
- mteb/tasks/classification/nld/open_tender_classification.py +38 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +10 -0
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
- mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
- mteb-2.1.1.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/RECORD +142 -95
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
mteb/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from importlib.metadata import version
|
|
2
2
|
|
|
3
|
+
from mteb import types
|
|
3
4
|
from mteb.abstasks import AbsTask
|
|
4
5
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
6
|
from mteb.deprecated_evaluator import MTEB
|
|
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
|
|
|
7
8
|
from mteb.filter_tasks import filter_tasks
|
|
8
9
|
from mteb.get_tasks import get_task, get_tasks
|
|
9
10
|
from mteb.load_results import load_results
|
|
10
|
-
from mteb.models import
|
|
11
|
+
from mteb.models import (
|
|
12
|
+
CrossEncoderProtocol,
|
|
13
|
+
EncoderProtocol,
|
|
14
|
+
SearchProtocol,
|
|
15
|
+
SentenceTransformerEncoderWrapper,
|
|
16
|
+
)
|
|
11
17
|
from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
|
|
12
18
|
from mteb.results import BenchmarkResults, TaskResult
|
|
13
19
|
|
|
@@ -21,7 +27,9 @@ __all__ = [
|
|
|
21
27
|
"AbsTask",
|
|
22
28
|
"Benchmark",
|
|
23
29
|
"BenchmarkResults",
|
|
30
|
+
"CrossEncoderProtocol",
|
|
24
31
|
"EncoderProtocol",
|
|
32
|
+
"SearchProtocol",
|
|
25
33
|
"SentenceTransformerEncoderWrapper",
|
|
26
34
|
"TaskMetadata",
|
|
27
35
|
"TaskResult",
|
|
@@ -35,4 +43,5 @@ __all__ = [
|
|
|
35
43
|
"get_task",
|
|
36
44
|
"get_tasks",
|
|
37
45
|
"load_results",
|
|
46
|
+
"types",
|
|
38
47
|
]
|
mteb/_create_dataloaders.py
CHANGED
|
@@ -277,6 +277,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
|
|
|
277
277
|
# Leave the images as a list to avoid stacking errors.
|
|
278
278
|
collated[key] = [item[key] for item in batch]
|
|
279
279
|
else:
|
|
280
|
+
if any(item[key] is None for item in batch):
|
|
281
|
+
raise ValueError(f"Found None in batch for key '{key}'")
|
|
280
282
|
collated[key] = default_collate([item[key] for item in batch])
|
|
281
283
|
return collated
|
|
282
284
|
|
mteb/abstasks/_stratification.py
CHANGED
|
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
|
|
|
134
134
|
class IterativeStratification(_BaseKFold):
|
|
135
135
|
"""Iteratively stratify a multi-label data set into folds
|
|
136
136
|
|
|
137
|
-
Construct an
|
|
137
|
+
Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
|
|
138
138
|
with respect to order-th label combinations.
|
|
139
139
|
"""
|
|
140
140
|
|
mteb/abstasks/abstask.py
CHANGED
|
@@ -459,7 +459,7 @@ class AbsTask(ABC):
|
|
|
459
459
|
"""Filter the languages of the task.
|
|
460
460
|
|
|
461
461
|
Args:
|
|
462
|
-
languages: list of languages to filter the task by can be either a 3-letter
|
|
462
|
+
languages: list of languages to filter the task by. Each entry can be either a 3-letter language code (e.g. "eng") or also include the script
|
|
463
463
|
(e.g. "eng-Latn")
|
|
464
464
|
script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
|
|
465
465
|
If the language code does not specify the script the intersection of the language and script will be used.
|
|
@@ -491,6 +491,11 @@ class AbsTask(ABC):
|
|
|
491
491
|
if lang_scripts.contains_languages(langs):
|
|
492
492
|
subsets_to_keep.append(hf_subset)
|
|
493
493
|
|
|
494
|
+
if len(subsets_to_keep) == 0:
|
|
495
|
+
raise ValueError(
|
|
496
|
+
f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
|
|
497
|
+
)
|
|
498
|
+
|
|
494
499
|
self.hf_subsets = subsets_to_keep
|
|
495
500
|
return self
|
|
496
501
|
|
mteb/abstasks/retrieval.py
CHANGED
|
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
653
653
|
FileNotFoundError: If the specified path does not exist.
|
|
654
654
|
ValueError: If the loaded top ranked results are not in the expected format.
|
|
655
655
|
"""
|
|
656
|
+
self._top_k = top_k
|
|
657
|
+
|
|
656
658
|
top_ranked_path = Path(top_ranked_path)
|
|
657
659
|
if top_ranked_path.is_dir():
|
|
658
660
|
top_ranked_path = self._predictions_path(top_ranked_path)
|
|
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
682
684
|
top_k_sorted[query_id] = sorted_keys[: self._top_k]
|
|
683
685
|
|
|
684
686
|
self.dataset[subset][split]["top_ranked"] = top_k_sorted
|
|
685
|
-
self._top_k = top_k
|
|
686
687
|
return self
|
|
687
688
|
|
|
688
689
|
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -532,7 +532,7 @@ class TaskMetadata(BaseModel):
|
|
|
532
532
|
citation=self.bibtex_citation,
|
|
533
533
|
dataset_description=self.description,
|
|
534
534
|
dataset_reference=self.reference,
|
|
535
|
-
|
|
535
|
+
descriptive_stats=descriptive_stats,
|
|
536
536
|
dataset_task_name=self.name,
|
|
537
537
|
category=self.category,
|
|
538
538
|
domains=", ".join(self.domains) if self.domains else None,
|
|
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
27
27
|
MTEB_KOR,
|
|
28
28
|
MTEB_MAIN_RU,
|
|
29
29
|
MTEB_MINERS_BITEXT_MINING,
|
|
30
|
+
MTEB_NL,
|
|
30
31
|
MTEB_POL,
|
|
31
32
|
MTEB_RETRIEVAL_LAW,
|
|
32
33
|
MTEB_RETRIEVAL_MEDICAL,
|
|
@@ -87,6 +88,7 @@ __all__ = [
|
|
|
87
88
|
"MTEB_KOR",
|
|
88
89
|
"MTEB_MAIN_RU",
|
|
89
90
|
"MTEB_MINERS_BITEXT_MINING",
|
|
91
|
+
"MTEB_NL",
|
|
90
92
|
"MTEB_POL",
|
|
91
93
|
"MTEB_RETRIEVAL_LAW",
|
|
92
94
|
"MTEB_RETRIEVAL_MEDICAL",
|
|
@@ -641,7 +641,7 @@ MTEB_KOR = Benchmark(
|
|
|
641
641
|
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
|
|
642
642
|
tasks=get_tasks(
|
|
643
643
|
languages=["kor"],
|
|
644
|
-
tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid
|
|
644
|
+
tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
|
|
645
645
|
# Classification
|
|
646
646
|
"KLUE-TC",
|
|
647
647
|
# Reranking
|
|
@@ -975,8 +975,6 @@ MTEB_INDIC = Benchmark(
|
|
|
975
975
|
# Bitext
|
|
976
976
|
"IN22ConvBitextMining",
|
|
977
977
|
"IN22GenBitextMining",
|
|
978
|
-
"IndicGenBenchFloresBitextMining",
|
|
979
|
-
"LinceMTBitextMining",
|
|
980
978
|
# clustering
|
|
981
979
|
"SIB200ClusteringS2S",
|
|
982
980
|
# classification
|
|
@@ -985,7 +983,6 @@ MTEB_INDIC = Benchmark(
|
|
|
985
983
|
"HindiDiscourseClassification",
|
|
986
984
|
"SentimentAnalysisHindi",
|
|
987
985
|
"MalayalamNewsClassification",
|
|
988
|
-
"IndicLangClassification",
|
|
989
986
|
"MTOPIntentClassification",
|
|
990
987
|
"MultiHateClassification",
|
|
991
988
|
"TweetSentimentClassification",
|
|
@@ -1008,7 +1005,7 @@ MTEB_INDIC = Benchmark(
|
|
|
1008
1005
|
# STS
|
|
1009
1006
|
(get_task("IndicCrosslingualSTS"),)
|
|
1010
1007
|
),
|
|
1011
|
-
description="A regional geopolitical text embedding benchmark
|
|
1008
|
+
description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
|
|
1012
1009
|
reference=None,
|
|
1013
1010
|
citation=MMTEB_CITATION,
|
|
1014
1011
|
contacts=["KennethEnevoldsen", "isaac-chung"],
|
|
@@ -1016,7 +1013,7 @@ MTEB_INDIC = Benchmark(
|
|
|
1016
1013
|
|
|
1017
1014
|
|
|
1018
1015
|
eu_languages = [
|
|
1019
|
-
# official EU languages (56) - we could include the whole economic area e.g. Norway -
|
|
1016
|
+
# official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
|
|
1020
1017
|
# germanic
|
|
1021
1018
|
"dan",
|
|
1022
1019
|
"eng",
|
|
@@ -1084,7 +1081,6 @@ MTEB_EU = Benchmark(
|
|
|
1084
1081
|
"AmazonCounterfactualClassification",
|
|
1085
1082
|
"MassiveScenarioClassification",
|
|
1086
1083
|
"MultiHateClassification",
|
|
1087
|
-
"NordicLangClassification",
|
|
1088
1084
|
"ScalaClassification",
|
|
1089
1085
|
"SwissJudgementClassification",
|
|
1090
1086
|
"TweetSentimentClassification",
|
|
@@ -1142,7 +1138,7 @@ MTEB_EU = Benchmark(
|
|
|
1142
1138
|
languages=eu_languages,
|
|
1143
1139
|
exclusive_language_filter=True,
|
|
1144
1140
|
),
|
|
1145
|
-
description="A regional geopolitical text embedding benchmark
|
|
1141
|
+
description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
|
|
1146
1142
|
reference=None,
|
|
1147
1143
|
citation=MMTEB_CITATION,
|
|
1148
1144
|
contacts=["KennethEnevoldsen", "isaac-chung"],
|
|
@@ -1636,6 +1632,81 @@ BEIR_NL = Benchmark(
|
|
|
1636
1632
|
""",
|
|
1637
1633
|
)
|
|
1638
1634
|
|
|
1635
|
+
MTEB_NL = Benchmark(
|
|
1636
|
+
name="MTEB(nld, v1)",
|
|
1637
|
+
display_name="MTEB-NL",
|
|
1638
|
+
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
|
|
1639
|
+
tasks=MTEBTasks(
|
|
1640
|
+
get_tasks(
|
|
1641
|
+
languages=["nld"],
|
|
1642
|
+
exclusive_language_filter=True,
|
|
1643
|
+
tasks=[
|
|
1644
|
+
# Classification
|
|
1645
|
+
"DutchBookReviewSentimentClassification",
|
|
1646
|
+
"MassiveIntentClassification",
|
|
1647
|
+
"MassiveScenarioClassification",
|
|
1648
|
+
"SIB200Classification",
|
|
1649
|
+
"MultiHateClassification",
|
|
1650
|
+
"VaccinChatNLClassification",
|
|
1651
|
+
"DutchColaClassification",
|
|
1652
|
+
"DutchGovernmentBiasClassification",
|
|
1653
|
+
"DutchSarcasticHeadlinesClassification",
|
|
1654
|
+
"DutchNewsArticlesClassification",
|
|
1655
|
+
"OpenTenderClassification",
|
|
1656
|
+
"IconclassClassification",
|
|
1657
|
+
# # PairClassification
|
|
1658
|
+
"SICKNLPairClassification",
|
|
1659
|
+
"XLWICNLPairClassification",
|
|
1660
|
+
# # MultiLabelClassification
|
|
1661
|
+
"CovidDisinformationNLMultiLabelClassification",
|
|
1662
|
+
"MultiEURLEXMultilabelClassification",
|
|
1663
|
+
"VABBMultiLabelClassification",
|
|
1664
|
+
# # Clustering
|
|
1665
|
+
"DutchNewsArticlesClusteringS2S",
|
|
1666
|
+
"DutchNewsArticlesClusteringP2P",
|
|
1667
|
+
"SIB200ClusteringS2S",
|
|
1668
|
+
"VABBClusteringS2S",
|
|
1669
|
+
"VABBClusteringP2P",
|
|
1670
|
+
"OpenTenderClusteringS2S",
|
|
1671
|
+
"OpenTenderClusteringP2P",
|
|
1672
|
+
"IconclassClusteringS2S",
|
|
1673
|
+
# # Reranking
|
|
1674
|
+
"WikipediaRerankingMultilingual",
|
|
1675
|
+
# # Retrieval
|
|
1676
|
+
"ArguAna-NL",
|
|
1677
|
+
"SCIDOCS-NL",
|
|
1678
|
+
"SciFact-NL",
|
|
1679
|
+
"NFCorpus-NL",
|
|
1680
|
+
"BelebeleRetrieval",
|
|
1681
|
+
# "WebFAQRetrieval",
|
|
1682
|
+
"DutchNewsArticlesRetrieval",
|
|
1683
|
+
"bBSARDNLRetrieval",
|
|
1684
|
+
"LegalQANLRetrieval",
|
|
1685
|
+
"OpenTenderRetrieval",
|
|
1686
|
+
"VABBRetrieval",
|
|
1687
|
+
"WikipediaRetrievalMultilingual",
|
|
1688
|
+
# # STS
|
|
1689
|
+
"SICK-NL-STS",
|
|
1690
|
+
"STSBenchmarkMultilingualSTS",
|
|
1691
|
+
],
|
|
1692
|
+
)
|
|
1693
|
+
),
|
|
1694
|
+
description="MTEB-NL",
|
|
1695
|
+
reference="https://arxiv.org/abs/2509.12340",
|
|
1696
|
+
contacts=["nikolay-banar"],
|
|
1697
|
+
citation=r"""
|
|
1698
|
+
@misc{banar2025mtebnle5nlembeddingbenchmark,
|
|
1699
|
+
archiveprefix = {arXiv},
|
|
1700
|
+
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
|
|
1701
|
+
eprint = {2509.12340},
|
|
1702
|
+
primaryclass = {cs.CL},
|
|
1703
|
+
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
|
|
1704
|
+
url = {https://arxiv.org/abs/2509.12340},
|
|
1705
|
+
year = {2025},
|
|
1706
|
+
}
|
|
1707
|
+
""",
|
|
1708
|
+
)
|
|
1709
|
+
|
|
1639
1710
|
MIEB_common_tasks = [
|
|
1640
1711
|
# Image Classification
|
|
1641
1712
|
"Birdsnap", # fine
|
|
@@ -1783,7 +1854,7 @@ MIEB_ENG = MIEBBenchmark(
|
|
|
1783
1854
|
),
|
|
1784
1855
|
description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
|
|
1785
1856
|
In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
|
|
1786
|
-
document
|
|
1857
|
+
document understanding, visual STS, and CV-centric tasks.""",
|
|
1787
1858
|
reference="https://arxiv.org/abs/2504.10471",
|
|
1788
1859
|
contacts=["gowitheflow-1998", "isaac-chung"],
|
|
1789
1860
|
citation=r"""
|
|
@@ -1817,7 +1888,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
|
|
|
1817
1888
|
),
|
|
1818
1889
|
description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
|
|
1819
1890
|
In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
|
|
1820
|
-
document
|
|
1891
|
+
document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
|
|
1821
1892
|
datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
|
|
1822
1893
|
reference="https://arxiv.org/abs/2504.10471",
|
|
1823
1894
|
contacts=["gowitheflow-1998", "isaac-chung"],
|
|
@@ -2038,7 +2109,7 @@ BUILT_MTEB = Benchmark(
|
|
|
2038
2109
|
"BuiltBenchReranking",
|
|
2039
2110
|
],
|
|
2040
2111
|
),
|
|
2041
|
-
description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various
|
|
2112
|
+
description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
|
|
2042
2113
|
reference="https://arxiv.org/abs/2411.12056",
|
|
2043
2114
|
citation=r"""
|
|
2044
2115
|
@article{shahinmoghadam2024benchmarking,
|
mteb/benchmarks/get_benchmark.py
CHANGED
|
@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
|
|
|
14
14
|
|
|
15
15
|
benchmark_registry = {
|
|
16
16
|
inst.name: inst
|
|
17
|
-
for
|
|
17
|
+
for _, inst in benchmark_module.__dict__.items()
|
|
18
18
|
if isinstance(inst, Benchmark)
|
|
19
19
|
}
|
|
20
20
|
return benchmark_registry
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 2400,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 92146,
|
|
7
|
+
"min_text_length": 5,
|
|
8
|
+
"average_text_length": 38.39416666666666,
|
|
9
|
+
"max_text_length": 138,
|
|
10
|
+
"unique_texts": 2400
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"1": {
|
|
20
|
+
"count": 1200
|
|
21
|
+
},
|
|
22
|
+
"0": {
|
|
23
|
+
"count": 1200
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 19893,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 761416,
|
|
33
|
+
"min_text_length": 4,
|
|
34
|
+
"average_text_length": 38.27557432262605,
|
|
35
|
+
"max_text_length": 152,
|
|
36
|
+
"unique_texts": 19893
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1": {
|
|
46
|
+
"count": 12604
|
|
47
|
+
},
|
|
48
|
+
"0": {
|
|
49
|
+
"count": 7289
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 752,
|
|
4
|
+
"number_texts_intersect_with_train": 100,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 171956,
|
|
7
|
+
"min_text_length": 32,
|
|
8
|
+
"average_text_length": 228.66489361702128,
|
|
9
|
+
"max_text_length": 2746,
|
|
10
|
+
"unique_texts": 752
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"0.0": {
|
|
20
|
+
"count": 555
|
|
21
|
+
},
|
|
22
|
+
"1.0": {
|
|
23
|
+
"count": 197
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 1718,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 390362,
|
|
33
|
+
"min_text_length": 18,
|
|
34
|
+
"average_text_length": 227.2188591385332,
|
|
35
|
+
"max_text_length": 2662,
|
|
36
|
+
"unique_texts": 1718
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1.0": {
|
|
46
|
+
"count": 470
|
|
47
|
+
},
|
|
48
|
+
"0.0": {
|
|
49
|
+
"count": 1248
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 1200,
|
|
4
|
+
"number_texts_intersect_with_train": 1,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 2034506,
|
|
7
|
+
"min_text_length": 184,
|
|
8
|
+
"average_text_length": 1695.4216666666666,
|
|
9
|
+
"max_text_length": 8825,
|
|
10
|
+
"unique_texts": 1200
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 8,
|
|
18
|
+
"labels": {
|
|
19
|
+
"Opmerkelijk": {
|
|
20
|
+
"count": 150
|
|
21
|
+
},
|
|
22
|
+
"Buitenland": {
|
|
23
|
+
"count": 150
|
|
24
|
+
},
|
|
25
|
+
"Cultuur & Media": {
|
|
26
|
+
"count": 150
|
|
27
|
+
},
|
|
28
|
+
"Binnenland": {
|
|
29
|
+
"count": 150
|
|
30
|
+
},
|
|
31
|
+
"Politiek": {
|
|
32
|
+
"count": 150
|
|
33
|
+
},
|
|
34
|
+
"Economie": {
|
|
35
|
+
"count": 150
|
|
36
|
+
},
|
|
37
|
+
"Tech": {
|
|
38
|
+
"count": 150
|
|
39
|
+
},
|
|
40
|
+
"Regionaal nieuws": {
|
|
41
|
+
"count": 150
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"train": {
|
|
47
|
+
"num_samples": 5600,
|
|
48
|
+
"number_texts_intersect_with_train": null,
|
|
49
|
+
"text_statistics": {
|
|
50
|
+
"total_text_length": 9620538,
|
|
51
|
+
"min_text_length": 106,
|
|
52
|
+
"average_text_length": 1717.9532142857142,
|
|
53
|
+
"max_text_length": 29389,
|
|
54
|
+
"unique_texts": 5600
|
|
55
|
+
},
|
|
56
|
+
"image_statistics": null,
|
|
57
|
+
"label_statistics": {
|
|
58
|
+
"min_labels_per_text": 1,
|
|
59
|
+
"average_label_per_text": 1.0,
|
|
60
|
+
"max_labels_per_text": 1,
|
|
61
|
+
"unique_labels": 8,
|
|
62
|
+
"labels": {
|
|
63
|
+
"Cultuur & Media": {
|
|
64
|
+
"count": 700
|
|
65
|
+
},
|
|
66
|
+
"Binnenland": {
|
|
67
|
+
"count": 700
|
|
68
|
+
},
|
|
69
|
+
"Buitenland": {
|
|
70
|
+
"count": 700
|
|
71
|
+
},
|
|
72
|
+
"Regionaal nieuws": {
|
|
73
|
+
"count": 700
|
|
74
|
+
},
|
|
75
|
+
"Politiek": {
|
|
76
|
+
"count": 700
|
|
77
|
+
},
|
|
78
|
+
"Economie": {
|
|
79
|
+
"count": 700
|
|
80
|
+
},
|
|
81
|
+
"Opmerkelijk": {
|
|
82
|
+
"count": 700
|
|
83
|
+
},
|
|
84
|
+
"Tech": {
|
|
85
|
+
"count": 700
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 1326,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 82644,
|
|
7
|
+
"min_text_length": 17,
|
|
8
|
+
"average_text_length": 62.32579185520362,
|
|
9
|
+
"max_text_length": 117,
|
|
10
|
+
"unique_texts": 1326
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"0": {
|
|
20
|
+
"count": 826
|
|
21
|
+
},
|
|
22
|
+
"1": {
|
|
23
|
+
"count": 500
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 10609,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 658787,
|
|
33
|
+
"min_text_length": 7,
|
|
34
|
+
"average_text_length": 62.09699311904986,
|
|
35
|
+
"max_text_length": 161,
|
|
36
|
+
"unique_texts": 10609
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1": {
|
|
46
|
+
"count": 4000
|
|
47
|
+
},
|
|
48
|
+
"0": {
|
|
49
|
+
"count": 6609
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 202,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 11827,
|
|
7
|
+
"min_text_length": 6,
|
|
8
|
+
"average_text_length": 58.54950495049505,
|
|
9
|
+
"max_text_length": 403,
|
|
10
|
+
"unique_texts": 202
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 9,
|
|
18
|
+
"labels": {
|
|
19
|
+
"Geschiedenis": {
|
|
20
|
+
"count": 22
|
|
21
|
+
},
|
|
22
|
+
"Klassieke mythologie en Oude Geschiedenis": {
|
|
23
|
+
"count": 22
|
|
24
|
+
},
|
|
25
|
+
"Literatuur": {
|
|
26
|
+
"count": 23
|
|
27
|
+
},
|
|
28
|
+
"Natuur": {
|
|
29
|
+
"count": 23
|
|
30
|
+
},
|
|
31
|
+
"De mens, de mensheid in het algemeen": {
|
|
32
|
+
"count": 22
|
|
33
|
+
},
|
|
34
|
+
"Maatschappij, civilisatie en cultuur": {
|
|
35
|
+
"count": 22
|
|
36
|
+
},
|
|
37
|
+
"Abstracte idee\u00ebn en concepten": {
|
|
38
|
+
"count": 23
|
|
39
|
+
},
|
|
40
|
+
"Religie en magie": {
|
|
41
|
+
"count": 22
|
|
42
|
+
},
|
|
43
|
+
"Bijbel": {
|
|
44
|
+
"count": 23
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"train": {
|
|
50
|
+
"num_samples": 945,
|
|
51
|
+
"number_texts_intersect_with_train": null,
|
|
52
|
+
"text_statistics": {
|
|
53
|
+
"total_text_length": 52510,
|
|
54
|
+
"min_text_length": 3,
|
|
55
|
+
"average_text_length": 55.56613756613756,
|
|
56
|
+
"max_text_length": 793,
|
|
57
|
+
"unique_texts": 945
|
|
58
|
+
},
|
|
59
|
+
"image_statistics": null,
|
|
60
|
+
"label_statistics": {
|
|
61
|
+
"min_labels_per_text": 1,
|
|
62
|
+
"average_label_per_text": 1.0,
|
|
63
|
+
"max_labels_per_text": 1,
|
|
64
|
+
"unique_labels": 9,
|
|
65
|
+
"labels": {
|
|
66
|
+
"Literatuur": {
|
|
67
|
+
"count": 105
|
|
68
|
+
},
|
|
69
|
+
"Maatschappij, civilisatie en cultuur": {
|
|
70
|
+
"count": 105
|
|
71
|
+
},
|
|
72
|
+
"Klassieke mythologie en Oude Geschiedenis": {
|
|
73
|
+
"count": 105
|
|
74
|
+
},
|
|
75
|
+
"Bijbel": {
|
|
76
|
+
"count": 105
|
|
77
|
+
},
|
|
78
|
+
"De mens, de mensheid in het algemeen": {
|
|
79
|
+
"count": 105
|
|
80
|
+
},
|
|
81
|
+
"Abstracte idee\u00ebn en concepten": {
|
|
82
|
+
"count": 105
|
|
83
|
+
},
|
|
84
|
+
"Natuur": {
|
|
85
|
+
"count": 105
|
|
86
|
+
},
|
|
87
|
+
"Geschiedenis": {
|
|
88
|
+
"count": 105
|
|
89
|
+
},
|
|
90
|
+
"Religie en magie": {
|
|
91
|
+
"count": 105
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|