mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. mteb/__init__.py +10 -1
  2. mteb/_create_dataloaders.py +2 -0
  3. mteb/abstasks/_stratification.py +1 -1
  4. mteb/abstasks/abstask.py +6 -1
  5. mteb/abstasks/dataset_card_template.md +1 -1
  6. mteb/abstasks/retrieval.py +2 -1
  7. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  8. mteb/abstasks/task_metadata.py +1 -1
  9. mteb/benchmarks/benchmarks/__init__.py +2 -0
  10. mteb/benchmarks/benchmarks/benchmarks.py +82 -11
  11. mteb/benchmarks/get_benchmark.py +1 -1
  12. mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
  13. mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
  14. mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
  15. mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
  16. mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
  17. mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
  18. mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
  19. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
  20. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
  21. mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
  22. mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
  23. mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
  24. mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
  25. mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
  26. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  27. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  28. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  29. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  30. mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
  31. mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
  32. mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
  33. mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
  34. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  35. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  36. mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
  37. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  38. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  39. mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
  40. mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
  41. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  42. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  43. mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
  44. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  45. mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
  46. mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
  47. mteb/languages/check_language_code.py +11 -3
  48. mteb/languages/language_scripts.py +4 -0
  49. mteb/leaderboard/text_segments.py +1 -1
  50. mteb/models/model_implementations/b1ade_models.py +1 -1
  51. mteb/models/model_implementations/bge_models.py +1 -3
  52. mteb/models/model_implementations/bmretriever_models.py +1 -1
  53. mteb/models/model_implementations/gme_v_models.py +2 -2
  54. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  55. mteb/models/model_implementations/inf_models.py +3 -3
  56. mteb/models/model_implementations/jina_models.py +12 -2
  57. mteb/models/model_implementations/llm2vec_models.py +1 -1
  58. mteb/models/model_implementations/misc_models.py +2 -2
  59. mteb/models/model_implementations/mxbai_models.py +1 -1
  60. mteb/models/model_implementations/salesforce_models.py +1 -1
  61. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  62. mteb/models/model_implementations/voyage_v.py +9 -9
  63. mteb/results/task_result.py +6 -8
  64. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
  65. mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
  66. mteb/tasks/classification/mya/myanmar_news.py +2 -2
  67. mteb/tasks/classification/nld/__init__.py +16 -0
  68. mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
  69. mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
  70. mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
  71. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
  72. mteb/tasks/classification/nld/iconclass_classification.py +41 -0
  73. mteb/tasks/classification/nld/open_tender_classification.py +38 -0
  74. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
  75. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  76. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  77. mteb/tasks/clustering/__init__.py +1 -0
  78. mteb/tasks/clustering/nld/__init__.py +17 -0
  79. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
  80. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
  81. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
  82. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
  83. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
  84. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
  85. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
  86. mteb/tasks/multilabel_classification/__init__.py +1 -0
  87. mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
  88. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
  89. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
  90. mteb/tasks/pair_classification/__init__.py +1 -0
  91. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  92. mteb/tasks/pair_classification/nld/__init__.py +7 -0
  93. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
  94. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
  95. mteb/tasks/retrieval/code/code_rag.py +8 -8
  96. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  97. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  98. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  99. mteb/tasks/retrieval/eng/__init__.py +18 -4
  100. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  101. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  102. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  103. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  104. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  105. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  106. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  107. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  108. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  109. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  110. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  111. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  112. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  113. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  114. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  115. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
  116. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  117. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  118. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  119. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  120. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  121. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  122. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  123. mteb/tasks/retrieval/nld/__init__.py +10 -0
  124. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
  125. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
  126. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
  127. mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
  128. mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
  129. mteb/tasks/retrieval/nob/norquad.py +2 -2
  130. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  131. mteb/tasks/retrieval/rus/__init__.py +11 -2
  132. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  133. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  134. mteb/tasks/sts/__init__.py +1 -0
  135. mteb/tasks/sts/nld/__init__.py +5 -0
  136. mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
  137. mteb-2.1.1.dist-info/METADATA +253 -0
  138. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/RECORD +142 -95
  139. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  140. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  141. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  142. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  143. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  144. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  145. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  146. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  147. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  148. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  149. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  150. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  151. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  152. mteb-2.0.5.dist-info/METADATA +0 -455
  153. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
  154. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
  155. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
  156. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from importlib.metadata import version
2
2
 
3
+ from mteb import types
3
4
  from mteb.abstasks import AbsTask
4
5
  from mteb.abstasks.task_metadata import TaskMetadata
5
6
  from mteb.deprecated_evaluator import MTEB
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
7
8
  from mteb.filter_tasks import filter_tasks
8
9
  from mteb.get_tasks import get_task, get_tasks
9
10
  from mteb.load_results import load_results
10
- from mteb.models import EncoderProtocol, SentenceTransformerEncoderWrapper
11
+ from mteb.models import (
12
+ CrossEncoderProtocol,
13
+ EncoderProtocol,
14
+ SearchProtocol,
15
+ SentenceTransformerEncoderWrapper,
16
+ )
11
17
  from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
12
18
  from mteb.results import BenchmarkResults, TaskResult
13
19
 
@@ -21,7 +27,9 @@ __all__ = [
21
27
  "AbsTask",
22
28
  "Benchmark",
23
29
  "BenchmarkResults",
30
+ "CrossEncoderProtocol",
24
31
  "EncoderProtocol",
32
+ "SearchProtocol",
25
33
  "SentenceTransformerEncoderWrapper",
26
34
  "TaskMetadata",
27
35
  "TaskResult",
@@ -35,4 +43,5 @@ __all__ = [
35
43
  "get_task",
36
44
  "get_tasks",
37
45
  "load_results",
46
+ "types",
38
47
  ]
@@ -277,6 +277,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
277
277
  # Leave the images as a list to avoid stacking errors.
278
278
  collated[key] = [item[key] for item in batch]
279
279
  else:
280
+ if any(item[key] is None for item in batch):
281
+ raise ValueError(f"Found None in batch for key '{key}'")
280
282
  collated[key] = default_collate([item[key] for item in batch])
281
283
  return collated
282
284
 
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
134
134
  class IterativeStratification(_BaseKFold):
135
135
  """Iteratively stratify a multi-label data set into folds
136
136
 
137
- Construct an interative stratifier that splits the data set into folds trying to maintain balanced representation
137
+ Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
138
138
  with respect to order-th label combinations.
139
139
  """
140
140
 
mteb/abstasks/abstask.py CHANGED
@@ -459,7 +459,7 @@ class AbsTask(ABC):
459
459
  """Filter the languages of the task.
460
460
 
461
461
  Args:
462
- languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script
462
+ languages: list of languages to filter the task by can be either a 3-letter language code (e.g. "eng") or also include the script
463
463
  (e.g. "eng-Latn")
464
464
  script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
465
465
  If the language code does not specify the script the intersection of the language and script will be used.
@@ -491,6 +491,11 @@ class AbsTask(ABC):
491
491
  if lang_scripts.contains_languages(langs):
492
492
  subsets_to_keep.append(hf_subset)
493
493
 
494
+ if len(subsets_to_keep) == 0:
495
+ raise ValueError(
496
+ f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
497
+ )
498
+
494
499
  self.hf_subsets = subsets_to_keep
495
500
  return self
496
501
 
@@ -85,7 +85,7 @@ desc_stats = task.metadata.descriptive_stats
85
85
  ```
86
86
 
87
87
  ```json
88
- {{ descritptive_stats | default("{}", true) }}
88
+ {{ descriptive_stats | default("{}", true) }}
89
89
  ```
90
90
 
91
91
  </details>
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
653
653
  FileNotFoundError: If the specified path does not exist.
654
654
  ValueError: If the loaded top ranked results are not in the expected format.
655
655
  """
656
+ self._top_k = top_k
657
+
656
658
  top_ranked_path = Path(top_ranked_path)
657
659
  if top_ranked_path.is_dir():
658
660
  top_ranked_path = self._predictions_path(top_ranked_path)
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
682
684
  top_k_sorted[query_id] = sorted_keys[: self._top_k]
683
685
 
684
686
  self.dataset[subset][split]["top_ranked"] = top_k_sorted
685
- self._top_k = top_k
686
687
  return self
687
688
 
688
689
 
@@ -176,7 +176,7 @@ class RetrievalDatasetLoader:
176
176
  {
177
177
  "query-id": Value("string"),
178
178
  "corpus-id": Value("string"),
179
- "score": Value("uint16"),
179
+ "score": Value("int32"),
180
180
  }
181
181
  )
182
182
  )
@@ -532,7 +532,7 @@ class TaskMetadata(BaseModel):
532
532
  citation=self.bibtex_citation,
533
533
  dataset_description=self.description,
534
534
  dataset_reference=self.reference,
535
- descritptive_stats=descriptive_stats,
535
+ descriptive_stats=descriptive_stats,
536
536
  dataset_task_name=self.name,
537
537
  category=self.category,
538
538
  domains=", ".join(self.domains) if self.domains else None,
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
27
27
  MTEB_KOR,
28
28
  MTEB_MAIN_RU,
29
29
  MTEB_MINERS_BITEXT_MINING,
30
+ MTEB_NL,
30
31
  MTEB_POL,
31
32
  MTEB_RETRIEVAL_LAW,
32
33
  MTEB_RETRIEVAL_MEDICAL,
@@ -87,6 +88,7 @@ __all__ = [
87
88
  "MTEB_KOR",
88
89
  "MTEB_MAIN_RU",
89
90
  "MTEB_MINERS_BITEXT_MINING",
91
+ "MTEB_NL",
90
92
  "MTEB_POL",
91
93
  "MTEB_RETRIEVAL_LAW",
92
94
  "MTEB_RETRIEVAL_MEDICAL",
@@ -641,7 +641,7 @@ MTEB_KOR = Benchmark(
641
641
  icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
642
642
  tasks=get_tasks(
643
643
  languages=["kor"],
644
- tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is.
644
+ tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
645
645
  # Classification
646
646
  "KLUE-TC",
647
647
  # Reranking
@@ -975,8 +975,6 @@ MTEB_INDIC = Benchmark(
975
975
  # Bitext
976
976
  "IN22ConvBitextMining",
977
977
  "IN22GenBitextMining",
978
- "IndicGenBenchFloresBitextMining",
979
- "LinceMTBitextMining",
980
978
  # clustering
981
979
  "SIB200ClusteringS2S",
982
980
  # classification
@@ -985,7 +983,6 @@ MTEB_INDIC = Benchmark(
985
983
  "HindiDiscourseClassification",
986
984
  "SentimentAnalysisHindi",
987
985
  "MalayalamNewsClassification",
988
- "IndicLangClassification",
989
986
  "MTOPIntentClassification",
990
987
  "MultiHateClassification",
991
988
  "TweetSentimentClassification",
@@ -1008,7 +1005,7 @@ MTEB_INDIC = Benchmark(
1008
1005
  # STS
1009
1006
  (get_task("IndicCrosslingualSTS"),)
1010
1007
  ),
1011
- description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
1008
+ description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
1012
1009
  reference=None,
1013
1010
  citation=MMTEB_CITATION,
1014
1011
  contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1013,7 @@ MTEB_INDIC = Benchmark(
1016
1013
 
1017
1014
 
1018
1015
  eu_languages = [
1019
- # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?)
1016
+ # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
1020
1017
  # germanic
1021
1018
  "dan",
1022
1019
  "eng",
@@ -1084,7 +1081,6 @@ MTEB_EU = Benchmark(
1084
1081
  "AmazonCounterfactualClassification",
1085
1082
  "MassiveScenarioClassification",
1086
1083
  "MultiHateClassification",
1087
- "NordicLangClassification",
1088
1084
  "ScalaClassification",
1089
1085
  "SwissJudgementClassification",
1090
1086
  "TweetSentimentClassification",
@@ -1142,7 +1138,7 @@ MTEB_EU = Benchmark(
1142
1138
  languages=eu_languages,
1143
1139
  exclusive_language_filter=True,
1144
1140
  ),
1145
- description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
1141
+ description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
1146
1142
  reference=None,
1147
1143
  citation=MMTEB_CITATION,
1148
1144
  contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1636,6 +1632,81 @@ BEIR_NL = Benchmark(
1636
1632
  """,
1637
1633
  )
1638
1634
 
1635
+ MTEB_NL = Benchmark(
1636
+ name="MTEB(nld, v1)",
1637
+ display_name="MTEB-NL",
1638
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
1639
+ tasks=MTEBTasks(
1640
+ get_tasks(
1641
+ languages=["nld"],
1642
+ exclusive_language_filter=True,
1643
+ tasks=[
1644
+ # Classification
1645
+ "DutchBookReviewSentimentClassification",
1646
+ "MassiveIntentClassification",
1647
+ "MassiveScenarioClassification",
1648
+ "SIB200Classification",
1649
+ "MultiHateClassification",
1650
+ "VaccinChatNLClassification",
1651
+ "DutchColaClassification",
1652
+ "DutchGovernmentBiasClassification",
1653
+ "DutchSarcasticHeadlinesClassification",
1654
+ "DutchNewsArticlesClassification",
1655
+ "OpenTenderClassification",
1656
+ "IconclassClassification",
1657
+ # # PairClassification
1658
+ "SICKNLPairClassification",
1659
+ "XLWICNLPairClassification",
1660
+ # # MultiLabelClassification
1661
+ "CovidDisinformationNLMultiLabelClassification",
1662
+ "MultiEURLEXMultilabelClassification",
1663
+ "VABBMultiLabelClassification",
1664
+ # # Clustering
1665
+ "DutchNewsArticlesClusteringS2S",
1666
+ "DutchNewsArticlesClusteringP2P",
1667
+ "SIB200ClusteringS2S",
1668
+ "VABBClusteringS2S",
1669
+ "VABBClusteringP2P",
1670
+ "OpenTenderClusteringS2S",
1671
+ "OpenTenderClusteringP2P",
1672
+ "IconclassClusteringS2S",
1673
+ # # Reranking
1674
+ "WikipediaRerankingMultilingual",
1675
+ # # Retrieval
1676
+ "ArguAna-NL",
1677
+ "SCIDOCS-NL",
1678
+ "SciFact-NL",
1679
+ "NFCorpus-NL",
1680
+ "BelebeleRetrieval",
1681
+ # "WebFAQRetrieval",
1682
+ "DutchNewsArticlesRetrieval",
1683
+ "bBSARDNLRetrieval",
1684
+ "LegalQANLRetrieval",
1685
+ "OpenTenderRetrieval",
1686
+ "VABBRetrieval",
1687
+ "WikipediaRetrievalMultilingual",
1688
+ # # STS
1689
+ "SICK-NL-STS",
1690
+ "STSBenchmarkMultilingualSTS",
1691
+ ],
1692
+ )
1693
+ ),
1694
+ description="MTEB-NL",
1695
+ reference="https://arxiv.org/abs/2509.12340",
1696
+ contacts=["nikolay-banar"],
1697
+ citation=r"""
1698
+ @misc{banar2025mtebnle5nlembeddingbenchmark,
1699
+ archiveprefix = {arXiv},
1700
+ author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
1701
+ eprint = {22509.12340},
1702
+ primaryclass = {cs.CL},
1703
+ title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
1704
+ url = {https://arxiv.org/abs/2509.12340},
1705
+ year = {2025},
1706
+ }
1707
+ """,
1708
+ )
1709
+
1639
1710
  MIEB_common_tasks = [
1640
1711
  # Image Classification
1641
1712
  "Birdsnap", # fine
@@ -1783,7 +1854,7 @@ MIEB_ENG = MIEBBenchmark(
1783
1854
  ),
1784
1855
  description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
1785
1856
  In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
1786
- document undestanding, visual STS, and CV-centric tasks.""",
1857
+ document understanding, visual STS, and CV-centric tasks.""",
1787
1858
  reference="https://arxiv.org/abs/2504.10471",
1788
1859
  contacts=["gowitheflow-1998", "isaac-chung"],
1789
1860
  citation=r"""
@@ -1817,7 +1888,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
1817
1888
  ),
1818
1889
  description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
1819
1890
  In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
1820
- document undestanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
1891
+ document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
1821
1892
  datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
1822
1893
  reference="https://arxiv.org/abs/2504.10471",
1823
1894
  contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2038,7 +2109,7 @@ BUILT_MTEB = Benchmark(
2038
2109
  "BuiltBenchReranking",
2039
2110
  ],
2040
2111
  ),
2041
- description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.',
2112
+ description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
2042
2113
  reference="https://arxiv.org/abs/2411.12056",
2043
2114
  citation=r"""
2044
2115
  @article{shahinmoghadam2024benchmarking,
@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
14
14
 
15
15
  benchmark_registry = {
16
16
  inst.name: inst
17
- for nam, inst in benchmark_module.__dict__.items()
17
+ for _, inst in benchmark_module.__dict__.items()
18
18
  if isinstance(inst, Benchmark)
19
19
  }
20
20
  return benchmark_registry
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2400,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 92146,
7
+ "min_text_length": 5,
8
+ "average_text_length": 38.39416666666666,
9
+ "max_text_length": 138,
10
+ "unique_texts": 2400
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "1": {
20
+ "count": 1200
21
+ },
22
+ "0": {
23
+ "count": 1200
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 19893,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 761416,
33
+ "min_text_length": 4,
34
+ "average_text_length": 38.27557432262605,
35
+ "max_text_length": 152,
36
+ "unique_texts": 19893
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 12604
47
+ },
48
+ "0": {
49
+ "count": 7289
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 752,
4
+ "number_texts_intersect_with_train": 100,
5
+ "text_statistics": {
6
+ "total_text_length": 171956,
7
+ "min_text_length": 32,
8
+ "average_text_length": 228.66489361702128,
9
+ "max_text_length": 2746,
10
+ "unique_texts": 752
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0.0": {
20
+ "count": 555
21
+ },
22
+ "1.0": {
23
+ "count": 197
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 1718,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 390362,
33
+ "min_text_length": 18,
34
+ "average_text_length": 227.2188591385332,
35
+ "max_text_length": 2662,
36
+ "unique_texts": 1718
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1.0": {
46
+ "count": 470
47
+ },
48
+ "0.0": {
49
+ "count": 1248
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,90 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1200,
4
+ "number_texts_intersect_with_train": 1,
5
+ "text_statistics": {
6
+ "total_text_length": 2034506,
7
+ "min_text_length": 184,
8
+ "average_text_length": 1695.4216666666666,
9
+ "max_text_length": 8825,
10
+ "unique_texts": 1200
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 8,
18
+ "labels": {
19
+ "Opmerkelijk": {
20
+ "count": 150
21
+ },
22
+ "Buitenland": {
23
+ "count": 150
24
+ },
25
+ "Cultuur & Media": {
26
+ "count": 150
27
+ },
28
+ "Binnenland": {
29
+ "count": 150
30
+ },
31
+ "Politiek": {
32
+ "count": 150
33
+ },
34
+ "Economie": {
35
+ "count": 150
36
+ },
37
+ "Tech": {
38
+ "count": 150
39
+ },
40
+ "Regionaal nieuws": {
41
+ "count": 150
42
+ }
43
+ }
44
+ }
45
+ },
46
+ "train": {
47
+ "num_samples": 5600,
48
+ "number_texts_intersect_with_train": null,
49
+ "text_statistics": {
50
+ "total_text_length": 9620538,
51
+ "min_text_length": 106,
52
+ "average_text_length": 1717.9532142857142,
53
+ "max_text_length": 29389,
54
+ "unique_texts": 5600
55
+ },
56
+ "image_statistics": null,
57
+ "label_statistics": {
58
+ "min_labels_per_text": 1,
59
+ "average_label_per_text": 1.0,
60
+ "max_labels_per_text": 1,
61
+ "unique_labels": 8,
62
+ "labels": {
63
+ "Cultuur & Media": {
64
+ "count": 700
65
+ },
66
+ "Binnenland": {
67
+ "count": 700
68
+ },
69
+ "Buitenland": {
70
+ "count": 700
71
+ },
72
+ "Regionaal nieuws": {
73
+ "count": 700
74
+ },
75
+ "Politiek": {
76
+ "count": 700
77
+ },
78
+ "Economie": {
79
+ "count": 700
80
+ },
81
+ "Opmerkelijk": {
82
+ "count": 700
83
+ },
84
+ "Tech": {
85
+ "count": 700
86
+ }
87
+ }
88
+ }
89
+ }
90
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1326,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 82644,
7
+ "min_text_length": 17,
8
+ "average_text_length": 62.32579185520362,
9
+ "max_text_length": 117,
10
+ "unique_texts": 1326
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0": {
20
+ "count": 826
21
+ },
22
+ "1": {
23
+ "count": 500
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 10609,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 658787,
33
+ "min_text_length": 7,
34
+ "average_text_length": 62.09699311904986,
35
+ "max_text_length": 161,
36
+ "unique_texts": 10609
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 4000
47
+ },
48
+ "0": {
49
+ "count": 6609
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,96 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 202,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 11827,
7
+ "min_text_length": 6,
8
+ "average_text_length": 58.54950495049505,
9
+ "max_text_length": 403,
10
+ "unique_texts": 202
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 9,
18
+ "labels": {
19
+ "Geschiedenis": {
20
+ "count": 22
21
+ },
22
+ "Klassieke mythologie en Oude Geschiedenis": {
23
+ "count": 22
24
+ },
25
+ "Literatuur": {
26
+ "count": 23
27
+ },
28
+ "Natuur": {
29
+ "count": 23
30
+ },
31
+ "De mens, de mensheid in het algemeen": {
32
+ "count": 22
33
+ },
34
+ "Maatschappij, civilisatie en cultuur": {
35
+ "count": 22
36
+ },
37
+ "Abstracte idee\u00ebn en concepten": {
38
+ "count": 23
39
+ },
40
+ "Religie en magie": {
41
+ "count": 22
42
+ },
43
+ "Bijbel": {
44
+ "count": 23
45
+ }
46
+ }
47
+ }
48
+ },
49
+ "train": {
50
+ "num_samples": 945,
51
+ "number_texts_intersect_with_train": null,
52
+ "text_statistics": {
53
+ "total_text_length": 52510,
54
+ "min_text_length": 3,
55
+ "average_text_length": 55.56613756613756,
56
+ "max_text_length": 793,
57
+ "unique_texts": 945
58
+ },
59
+ "image_statistics": null,
60
+ "label_statistics": {
61
+ "min_labels_per_text": 1,
62
+ "average_label_per_text": 1.0,
63
+ "max_labels_per_text": 1,
64
+ "unique_labels": 9,
65
+ "labels": {
66
+ "Literatuur": {
67
+ "count": 105
68
+ },
69
+ "Maatschappij, civilisatie en cultuur": {
70
+ "count": 105
71
+ },
72
+ "Klassieke mythologie en Oude Geschiedenis": {
73
+ "count": 105
74
+ },
75
+ "Bijbel": {
76
+ "count": 105
77
+ },
78
+ "De mens, de mensheid in het algemeen": {
79
+ "count": 105
80
+ },
81
+ "Abstracte idee\u00ebn en concepten": {
82
+ "count": 105
83
+ },
84
+ "Natuur": {
85
+ "count": 105
86
+ },
87
+ "Geschiedenis": {
88
+ "count": 105
89
+ },
90
+ "Religie en magie": {
91
+ "count": 105
92
+ }
93
+ }
94
+ }
95
+ }
96
+ }