mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/deprecated_evaluator.py
@@ -5,29 +5,30 @@ import logging
 import os
 import sys
 import traceback
-from collections.abc import Iterable
+import warnings
+from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast

 import datasets

 import mteb
 from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
-    EncoderProtocol,
     ModelMeta,
     MTEBModels,
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import TaskResult
-from mteb.types import ScoresDict
+from mteb.types import EncodeKwargs, ScoresDict

 if sys.version_info >= (3, 13):
     from warnings import deprecated
@@ -52,7 +53,7 @@ class MTEB:
     )
     def __init__(
         self,
-        tasks: Iterable[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask] | Iterable[Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
     ) -> None:
@@ -63,15 +64,14 @@ class MTEB:
                 `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
             err_logs_path: Path to save error logs.
         """
-        from mteb.benchmarks import Benchmark
-
-        self.tasks = list(tasks)
-        if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+        if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(self.tasks))
+            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+        elif isinstance(next(iter(tasks)), AbsTask):
+            self.tasks = list(cast(Iterable[AbsTask], tasks))

         self.err_logs_path = Path(err_logs_path)
-        self.last_evaluated_splits = {}
+        self._last_evaluated_splits: dict[str, list[str]] = {}

     @property
     def available_tasks(self) -> list[str]:
@@ -84,7 +84,7 @@ class MTEB:
         return sorted({x.metadata.type for x in self.tasks})

     @property
-    def available_task_categories(self) -> set[TaskCategory]:
+    def available_task_categories(self) -> set[TaskCategory | None]:
         """Set of available task categories."""
         return {x.metadata.category for x in self.tasks}

@@ -174,7 +174,7 @@ class MTEB:
         split: str,
         subsets_to_run: list[str] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         **kwargs: Any,
     ):
         tick = time()
@@ -231,13 +231,14 @@ class MTEB:
         merged_kg_co2_emissions = None
         if existing_kg_co2_emissions and new_kg_co2_emissions:
             merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+        existing_evaluation_time = existing_results.evaluation_time or 0
+        new_evaluation_time = new_results.evaluation_time or 0
         merged_results = TaskResult(
             dataset_revision=new_results.dataset_revision,
             task_name=new_results.task_name,
             mteb_version=new_results.mteb_version,
             scores=merged_scores,
-            evaluation_time=existing_results.evaluation_time
-            + new_results.evaluation_time,
+            evaluation_time=existing_evaluation_time + new_evaluation_time,
             kg_co2_emissions=merged_kg_co2_emissions,
         )

@@ -262,7 +263,7 @@ class MTEB:
         overwrite_results: bool = False,
         raise_error: bool = True,
         co2_tracker: bool = False,
-        encode_kwargs: dict[str, Any] | None = None,
+        encode_kwargs: EncodeKwargs | None = None,
         **kwargs,
     ) -> list[TaskResult]:
         """Run the evaluation pipeline on the selected tasks.
@@ -306,13 +307,16 @@ class MTEB:
         elif verbosity == 3:
             datasets.logging.set_verbosity(logging.DEBUG)

-        meta = self.create_model_meta(model)
-        output_path = self._create_output_folder(meta, output_folder)
-
+        mteb_model: MTEBModels
         if isinstance(model, SentenceTransformer):
-            model = SentenceTransformerEncoderWrapper(model)
+            mteb_model = SentenceTransformerEncoderWrapper(model)
         elif isinstance(model, CrossEncoder):
-            model = CrossEncoderWrapper(model)
+            mteb_model = CrossEncoderWrapper(model)
+        else:
+            mteb_model = cast(MTEBModels, model)
+
+        meta = self.create_model_meta(mteb_model)
+        output_path = self._create_output_folder(meta, output_folder)

         # Disable co2_tracker for API models
         if "API" in meta.framework:
@@ -333,7 +337,7 @@ class MTEB:
         )  # save them in case we re-use the object (e.g. for reranking)

         # To evaluate missing splits, we keep track of the task name and the corresponding splits.
-        self.last_evaluated_splits = {}
+        self._last_evaluated_splits = {}

         while len(self.tasks) > 0:
             task = self.tasks[0]
@@ -342,9 +346,10 @@ class MTEB:
             )

             if task.is_aggregate:
-                self_ = MTEB(tasks=task.metadata.tasks)
-                task_results = self_.run(
-                    model,
+                aggregated_task = cast(AbsTaskAggregate, task)
+                self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+                aggregated_task_results = self_.run(
+                    mteb_model,
                     verbosity=verbosity - 1,
                     output_folder=output_folder,
                     eval_splits=eval_splits,
@@ -355,12 +360,15 @@ class MTEB:
                     encode_kwargs=encode_kwargs,
                     **kwargs,
                 )
-                new_results = task.combine_task_results(task_results)
+                new_results = aggregated_task.combine_task_results(
+                    aggregated_task_results
+                )
                 evaluation_results.append(new_results)

                 if output_path:
-                    save_path = output_path / f"{task.metadata.name}.json"
-                    new_results.to_disk(save_path)
+                    new_results.to_disk(
+                        output_path / f"{aggregated_task.metadata.name}.json"
+                    )
                 del self.tasks[0]
                 continue

@@ -382,7 +390,7 @@ class MTEB:
             task_subsets = task.hf_subsets

             existing_results = None
-            save_path = None
+            save_path: Path | None = None
             final_splits_to_run = task_eval_splits
             missing_evaluations = self._get_missing_evaluations(
                 existing_results,
@@ -432,7 +440,7 @@ class MTEB:
                 logger.info(
                     f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
                 )
-                self.last_evaluated_splits[task.metadata.name] = []
+                self._last_evaluated_splits[task.metadata.name] = []
                 del self.tasks[0]
                 continue

@@ -440,11 +448,11 @@ class MTEB:
             task.check_if_dataset_is_superseded()
             task.load_data()

-            task_results = {}
+            task_results: dict[str, dict[str, dict[str, Any]]] = {}
             evaluation_time = 0
             kg_co2_emissions: int | None = 0 if co2_tracker else None

-            self.last_evaluated_splits[task.metadata.name] = []
+            self._last_evaluated_splits[task.metadata.name] = []

             for split in final_splits_to_run:
                 info = missing_evaluations[split]
@@ -465,14 +473,16 @@ class MTEB:

                 if co2_tracker:
                     try:
-                        from codecarbon import EmissionsTracker
+                        from codecarbon import (  # type: ignore[import-not-found,import-untyped]
+                            EmissionsTracker,
+                        )
                     except ImportError:
                         raise ImportError(
                             "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
                         )
-                    logger.warning(
-                        "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
-                    )
+                    msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
+                    logger.warning(msg)
+                    warnings.warn(msg)
                     with EmissionsTracker(
                         save_to_file=False,
                         save_to_api=False,
@@ -481,7 +491,7 @@ class MTEB:
                     ) as tracker:
                         results, tick, tock = self._run_eval(
                             task,
-                            model,
+                            mteb_model,
                             split,
                             encode_kwargs=encode_kwargs,
                             subsets_to_run=subsets_to_run,
@@ -494,7 +504,7 @@ class MTEB:
                 else:
                     results, tick, tock = self._run_eval(
                         task,
-                        model,
+                        mteb_model,
                         split,
                         subsets_to_run=subsets_to_run,
                         encode_kwargs=encode_kwargs,
@@ -510,25 +520,25 @@ class MTEB:
                 if verbosity >= 1:
                     logger.info(f"Scores: {task_results[split]}")

-                self.last_evaluated_splits[task.metadata.name].append(split)
+                self._last_evaluated_splits[task.metadata.name].append(split)

             # Create new TaskResult
             new_results = TaskResult.from_task_results(
                 task,
-                task_results,
+                task_results,  # type: ignore[arg-type]
                 evaluation_time=evaluation_time,
                 kg_co2_emissions=kg_co2_emissions,
             )

             # Merge with existing if needed
-            if output_path and save_path.exists():
+            if output_path and save_path and save_path.exists():
                 existing_results = TaskResult.from_disk(save_path)
                 if existing_results:
                     merged_results = self._merge_results(existing_results, new_results)
                 else:
                     merged_results = new_results

-            if output_path:
+            if output_path and save_path:
                 merged_results.to_disk(save_path)

             evaluation_results.append(merged_results)
@@ -555,7 +565,7 @@ class MTEB:
     def create_model_meta(model: MTEBModels) -> ModelMeta:
         """Create a ModelMeta object for the given model."""
         if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
-            meta = model.mteb_model_meta  # type: ignore
+            meta = model.mteb_model_meta
         else:
             meta = MTEB._get_model_meta(model)

@@ -581,7 +591,11 @@ class MTEB:
         if output_folder is None:
             return None

-        model_revision: str = model_meta.revision  # type: ignore
+        model_revision: str = (
+            model_meta.revision
+            if model_meta.revision is not None
+            else "no_revision_available"
+        )
         model_path_name = model_meta.model_name_as_path()

         output_path = Path(output_folder) / model_path_name / model_revision
@@ -603,15 +617,15 @@ class MTEB:
         Tasks with empty lists indicate that results already existed and no splits were evaluated.
         """
         return deepcopy(
-            {task: list(splits) for task, splits in self.last_evaluated_splits.items()}
+            {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
         )

     @staticmethod
     def _get_missing_evaluations(
         existing_results: TaskResult | None,
-        task_eval_splits: list[str],
-        task_eval_langs: list[str],
-        eval_subsets: list[str] | None,
+        task_eval_splits: Sequence[str],
+        task_eval_langs: Sequence[str],
+        eval_subsets: Sequence[str] | None,
     ) -> dict[str, dict[str, Any]]:
         """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
         missing_evaluations = {
@@ -660,7 +674,7 @@ class MTEB:
         return missing_evaluations

     @staticmethod
-    def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
+    def _get_model_meta(model: MTEBModels) -> ModelMeta:
         from sentence_transformers import CrossEncoder, SentenceTransformer

         if isinstance(model, CrossEncoder):
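The changes above mainly tighten typing (`EncodeKwargs`, `Sequence`, `cast`) and wrap raw `SentenceTransformer`/`CrossEncoder` objects into an internal `mteb_model` before model metadata and output paths are resolved. For orientation only, below is a minimal usage sketch of this runner; task selection follows the `__init__` docstring shown above, while the model name, output folder, and batch size are illustrative placeholders, not values taken from this diff.

```python
# Hedged sketch of driving the MTEB runner touched in this diff.
# Assumptions: model name, output folder, and batch size are placeholders.
import mteb
from sentence_transformers import SentenceTransformer

# Task selection as documented in __init__:
# mteb.get_tasks([...]) or mteb.get_benchmark("MTEB(eng, classic)")
tasks = mteb.get_tasks(tasks=["ChemRxivRetrieval"])
evaluation = mteb.MTEB(tasks=tasks)

# A plain SentenceTransformer is wrapped internally
# (SentenceTransformerEncoderWrapper) before ModelMeta is created.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
results = evaluation.run(
    model,
    output_folder="results",
    encode_kwargs={"batch_size": 32},  # now typed as EncodeKwargs | None
)
```

Since the class lives in `deprecated_evaluator.py`, it is presumably the legacy entry point; the sketch only illustrates the call shape affected by the typing and wrapping changes, not a recommended workflow.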
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json
@@ -0,0 +1,32 @@
+{
+    "test": {
+        "num_samples": 1299,
+        "number_of_characters": 9254,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+            "min_image_width": 2245,
+            "average_image_width": 2370.324347826087,
+            "max_image_width": 3508,
+            "min_image_height": 2481,
+            "average_image_height": 3289.8060869565215,
+            "max_image_height": 3580,
+            "unique_images": 1132
+        },
+        "queries_text_statistics": {
+            "total_text_length": 9254,
+            "min_text_length": 15,
+            "average_text_length": 62.10738255033557,
+            "max_text_length": 108,
+            "unique_texts": 149
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 409,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 2.7449664429530203,
+            "max_relevant_docs_per_query": 7,
+            "unique_relevant_docs": 316
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json
@@ -0,0 +1,32 @@
+{
+    "test": {
+        "num_samples": 1640,
+        "number_of_characters": 8331,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+            "min_image_width": 2313,
+            "average_image_width": 2347.5321597833445,
+            "max_image_width": 2481,
+            "min_image_height": 3138,
+            "average_image_height": 3214.301963439404,
+            "max_image_height": 3508,
+            "unique_images": 1442
+        },
+        "queries_text_statistics": {
+            "total_text_length": 8331,
+            "min_text_length": 23,
+            "average_text_length": 51.11042944785276,
+            "max_text_length": 110,
+            "unique_texts": 163
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 413,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 2.5337423312883436,
+            "max_relevant_docs_per_query": 6,
+            "unique_relevant_docs": 349
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json
@@ -0,0 +1,32 @@
+{
+    "test": {
+        "num_samples": 2166,
+        "number_of_characters": 9764,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+            "min_image_width": 2221,
+            "average_image_width": 2339.4957350727545,
+            "max_image_width": 2480,
+            "min_image_height": 3036,
+            "average_image_height": 3242.8138484696437,
+            "max_image_height": 3508,
+            "unique_images": 1974
+        },
+        "queries_text_statistics": {
+            "total_text_length": 9764,
+            "min_text_length": 22,
+            "average_text_length": 56.4393063583815,
+            "max_text_length": 103,
+            "unique_texts": 173
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 525,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 3.0346820809248554,
+            "max_relevant_docs_per_query": 7,
+            "unique_relevant_docs": 442
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json
@@ -0,0 +1,32 @@
+{
+    "test": {
+        "num_samples": 2330,
+        "number_of_characters": 13131,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+            "min_image_width": 1949,
+            "average_image_width": 2430.1152204836417,
+            "max_image_width": 3505,
+            "min_image_height": 2480,
+            "average_image_height": 3350.3921289710765,
+            "max_image_height": 3626,
+            "unique_images": 2096
+        },
+        "queries_text_statistics": {
+            "total_text_length": 13131,
+            "min_text_length": 21,
+            "average_text_length": 59.41628959276018,
+            "max_text_length": 112,
+            "unique_texts": 221
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 726,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 3.2850678733031673,
+            "max_relevant_docs_per_query": 7,
+            "unique_relevant_docs": 575
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json
@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 74457,
+        "number_of_characters": 76109543,
+        "documents_text_statistics": {
+            "total_text_length": 75549698,
+            "min_text_length": 121,
+            "average_text_length": 1087.7189916063176,
+            "max_text_length": 25438,
+            "unique_texts": 69150
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 559845,
+            "min_text_length": 57,
+            "average_text_length": 111.969,
+            "max_text_length": 224,
+            "unique_texts": 5000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 5000,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0,
+            "max_relevant_docs_per_query": 1,
+            "unique_relevant_docs": 5000
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json
@@ -0,0 +1,116 @@
+{
+    "test": {
+        "num_samples": 30300,
+        "number_of_characters": 17320243,
+        "documents_text_statistics": {
+            "total_text_length": 17276572,
+            "min_text_length": 316,
+            "average_text_length": 575.8857333333333,
+            "max_text_length": 1008,
+            "unique_texts": 28361
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 43671,
+            "min_text_length": 67,
+            "average_text_length": 145.57,
+            "max_text_length": 345,
+            "unique_texts": 300
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 300,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0,
+            "max_relevant_docs_per_query": 1,
+            "unique_relevant_docs": 300
+        },
+        "top_ranked_statistics": null,
+        "hf_subset_descriptive_stats": {
+            "en": {
+                "num_samples": 10100,
+                "number_of_characters": 5517678,
+                "documents_text_statistics": {
+                    "total_text_length": 5503635,
+                    "min_text_length": 316,
+                    "average_text_length": 550.3635,
+                    "max_text_length": 726,
+                    "unique_texts": 9422
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14043,
+                    "min_text_length": 68,
+                    "average_text_length": 140.43,
+                    "max_text_length": 305,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            },
+            "fi": {
+                "num_samples": 10100,
+                "number_of_characters": 5953462,
+                "documents_text_statistics": {
+                    "total_text_length": 5938809,
+                    "min_text_length": 326,
+                    "average_text_length": 593.8809,
+                    "max_text_length": 1008,
+                    "unique_texts": 9422
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14653,
+                    "min_text_length": 67,
+                    "average_text_length": 146.53,
+                    "max_text_length": 345,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            },
+            "pt": {
+                "num_samples": 10100,
+                "number_of_characters": 5849103,
+                "documents_text_statistics": {
+                    "total_text_length": 5834128,
+                    "min_text_length": 325,
+                    "average_text_length": 583.4128,
+                    "max_text_length": 774,
+                    "unique_texts": 9517
+                },
+                "documents_image_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 14975,
+                    "min_text_length": 69,
+                    "average_text_length": 149.75,
+                    "max_text_length": 320,
+                    "unique_texts": 100
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 100
+                },
+                "top_ranked_statistics": null
+            }
+        }
+    }
+}
mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json
@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 102198,
+        "number_of_characters": 47870352,
+        "documents_text_statistics": {
+            "total_text_length": 47719757,
+            "min_text_length": 9,
+            "average_text_length": 472.01951591046225,
+            "max_text_length": 8686,
+            "unique_texts": 101097
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 150595,
+            "min_text_length": 30,
+            "average_text_length": 136.78019981834694,
+            "max_text_length": 404,
+            "unique_texts": 1099
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 3401,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 3.089009990917348,
+            "max_relevant_docs_per_query": 5,
+            "unique_relevant_docs": 1123
+        },
+        "top_ranked_statistics": null
+    }
+}
mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json
@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 132137,
+        "number_of_characters": 43323279,
+        "documents_text_statistics": {
+            "total_text_length": 43311486,
+            "min_text_length": 11,
+            "average_text_length": 328.5778249819823,
+            "max_text_length": 8576,
+            "unique_texts": 131814
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 11793,
+            "min_text_length": 6,
+            "average_text_length": 36.62422360248447,
+            "max_text_length": 100,
+            "unique_texts": 321
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 11620,
+            "min_relevant_docs_per_query": 31,
+            "average_relevant_docs_per_query": 36.08695652173913,
+            "max_relevant_docs_per_query": 1288,
+            "unique_relevant_docs": 32537
+        },
+        "top_ranked_statistics": null
+    }
+}