mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241):
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 106558,
4
+ "number_of_characters": 48164581,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 47886101,
7
+ "min_text_length": 9,
8
+ "average_text_length": 472.6783768310499,
9
+ "max_text_length": 8689,
10
+ "unique_texts": 101308
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 278480,
15
+ "min_text_length": 11,
16
+ "average_text_length": 53.04380952380952,
17
+ "max_text_length": 196,
18
+ "unique_texts": 5124
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 6254,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.1912380952380952,
25
+ "max_relevant_docs_per_query": 15,
26
+ "unique_relevant_docs": 1324
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 117974,
4
+ "number_of_characters": 35927363,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 35335613,
7
+ "min_text_length": 22,
8
+ "average_text_length": 316.47705838625023,
9
+ "max_text_length": 4105,
10
+ "unique_texts": 111651
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 591750,
15
+ "min_text_length": 21,
16
+ "average_text_length": 93.61651637399146,
17
+ "max_text_length": 280,
18
+ "unique_texts": 6321
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 12642,
23
+ "min_relevant_docs_per_query": 2,
24
+ "average_relevant_docs_per_query": 2.0,
25
+ "max_relevant_docs_per_query": 2,
26
+ "unique_relevant_docs": 11874
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "dev": {
3
+ "num_samples": 107153,
4
+ "number_of_characters": 33316879,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 33200903,
7
+ "min_text_length": 2,
8
+ "average_text_length": 320.30199218561575,
9
+ "max_text_length": 1712,
10
+ "unique_texts": 103641
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 115976,
15
+ "min_text_length": 8,
16
+ "average_text_length": 33.15494568324757,
17
+ "max_text_length": 190,
18
+ "unique_texts": 3498
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 3700,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.0577472841623785,
25
+ "max_relevant_docs_per_query": 4,
26
+ "unique_relevant_docs": 3698
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 104095,
4
+ "number_of_characters": 52312680,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 52220289,
7
+ "min_text_length": 10,
8
+ "average_text_length": 510.98673124908265,
9
+ "max_text_length": 10245,
10
+ "unique_texts": 102181
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 92391,
15
+ "min_text_length": 22,
16
+ "average_text_length": 48.62684210526316,
17
+ "max_text_length": 113,
18
+ "unique_texts": 1900
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 2283,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.201578947368421,
25
+ "max_relevant_docs_per_query": 4,
26
+ "unique_relevant_docs": 2283
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 20561,
4
+ "number_of_characters": 10832770,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 9929303,
7
+ "min_text_length": 9,
8
+ "average_text_length": 938.8524016641452,
9
+ "max_text_length": 6319,
10
+ "unique_texts": 10573
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 903467,
15
+ "min_text_length": 13,
16
+ "average_text_length": 90.48242363545317,
17
+ "max_text_length": 228,
18
+ "unique_texts": 9985
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 11158,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.1174762143214823,
25
+ "max_relevant_docs_per_query": 8,
26
+ "unique_relevant_docs": 10576
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
mteb/evaluate.py CHANGED
@@ -1,10 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import warnings
4
5
  from collections.abc import Iterable
5
6
  from pathlib import Path
6
7
  from time import time
7
- from typing import TYPE_CHECKING, Any, cast
8
+ from typing import TYPE_CHECKING, cast
8
9
 
9
10
  from datasets.exceptions import DatasetNotFoundError
10
11
  from tqdm.auto import tqdm
@@ -13,11 +14,10 @@ from mteb._helpful_enum import HelpfulStrEnum
13
14
  from mteb.abstasks import AbsTaskRetrieval
14
15
  from mteb.abstasks.abstask import AbsTask
15
16
  from mteb.abstasks.aggregated_task import AbsTaskAggregate
17
+ from mteb.benchmarks.benchmark import Benchmark
16
18
  from mteb.cache import ResultCache
17
19
  from mteb.models.model_meta import ModelMeta
18
20
  from mteb.models.models_protocols import (
19
- CrossEncoderProtocol,
20
- EncoderProtocol,
21
21
  MTEBModels,
22
22
  )
23
23
  from mteb.models.sentence_transformer_wrapper import (
@@ -27,6 +27,7 @@ from mteb.models.sentence_transformer_wrapper import (
27
27
  from mteb.results import ModelResult, TaskResult
28
28
  from mteb.results.task_result import TaskError
29
29
  from mteb.types import HFSubset, PromptType, SplitName
30
+ from mteb.types._encoder_io import EncodeKwargs
30
31
  from mteb.types._metadata import ModelName, Revision
31
32
 
32
33
  if TYPE_CHECKING:
@@ -57,27 +58,26 @@ def _sanitize_model(
57
58
  ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]:
58
59
  from sentence_transformers import CrossEncoder, SentenceTransformer
59
60
 
61
+ wrapped_model: MTEBModels | ModelMeta
60
62
  if isinstance(model, SentenceTransformer):
61
- _mdl = SentenceTransformerEncoderWrapper(model)
62
- meta = _mdl.mteb_model_meta
63
- _mdl = cast(EncoderProtocol, _mdl)
64
- model = _mdl
63
+ wrapped_model = SentenceTransformerEncoderWrapper(model)
64
+ meta = wrapped_model.mteb_model_meta
65
65
  elif isinstance(model, CrossEncoder):
66
- _mdl = CrossEncoderWrapper(model)
67
- _mdl = cast(CrossEncoderProtocol, _mdl)
68
- meta = _mdl.mteb_model_meta
69
- model = _mdl
66
+ wrapped_model = CrossEncoderWrapper(model)
67
+ meta = wrapped_model.mteb_model_meta
70
68
  elif hasattr(model, "mteb_model_meta"):
71
- meta = model.mteb_model_meta # type: ignore[attr-defined]
69
+ meta = getattr(model, "mteb_model_meta")
72
70
  if not isinstance(meta, ModelMeta):
73
- meta = ModelMeta.from_hub(None)
71
+ meta = ModelMeta._from_hub(None)
72
+ wrapped_model = cast(MTEBModels | ModelMeta, model)
74
73
  else:
75
- meta = ModelMeta.from_hub(None) if not isinstance(model, ModelMeta) else model
74
+ meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
75
+ wrapped_model = meta
76
76
 
77
77
  model_name = cast(str, meta.name)
78
78
  model_revision = cast(str, meta.revision)
79
79
 
80
- return model, meta, model_name, model_revision
80
+ return wrapped_model, meta, model_name, model_revision
81
81
 
82
82
 
83
83
  def _evaluate_task(
@@ -86,9 +86,10 @@ def _evaluate_task(
86
86
  *,
87
87
  splits: dict[SplitName, list[HFSubset]],
88
88
  co2_tracker: bool | None,
89
- encode_kwargs: dict[str, Any],
89
+ encode_kwargs: EncodeKwargs,
90
90
  prediction_folder: Path | None,
91
91
  public_only: bool | None,
92
+ num_proc: int = 1,
92
93
  ) -> TaskResult | TaskError:
93
94
  """The core logic to run a model on a given task. See `evaluate` for more details.
94
95
 
@@ -123,7 +124,8 @@ def _evaluate_task(
123
124
  prediction_folder=prediction_folder,
124
125
  public_only=public_only,
125
126
  )
126
- result.kg_co2_emissions = tracker.final_emissions
127
+ if isinstance(result, TaskResult):
128
+ result.kg_co2_emissions = tracker.final_emissions
127
129
  return result
128
130
 
129
131
  task_results = {}
@@ -136,10 +138,12 @@ def _evaluate_task(
136
138
  task.load_data()
137
139
  except DatasetNotFoundError as e:
138
140
  if not task.metadata.is_public and public_only is None:
139
- logger.warning(
141
+ msg = (
140
142
  f"Dataset for private task '{task.metadata.name}' not found. "
141
143
  "Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
142
144
  )
145
+ logger.warning(msg)
146
+ warnings.warn(msg)
143
147
  return TaskError(
144
148
  task_name=task.metadata.name,
145
149
  exception=str(e),
@@ -147,7 +151,7 @@ def _evaluate_task(
147
151
  if public_only is False:
148
152
  raise e
149
153
 
150
- evaluation_time = 0
154
+ evaluation_time = 0.0
151
155
 
152
156
  for split, hf_subsets in splits.items():
153
157
  tick = time()
@@ -194,12 +198,18 @@ def _check_model_modalities(
194
198
  return
195
199
 
196
200
  model_modalities = set(model.modalities)
201
+ check_tasks: Iterable[AbsTask] = []
197
202
  if isinstance(tasks, AbsTask):
198
- tasks = [tasks]
203
+ check_tasks = [tasks]
204
+ elif isinstance(tasks, Benchmark):
205
+ benchmark = cast(Benchmark, tasks)
206
+ check_tasks = benchmark.tasks
207
+ else:
208
+ check_tasks = cast(Iterable[AbsTask], tasks)
199
209
 
200
210
  warnings, errors = [], []
201
211
 
202
- for task in tasks:
212
+ for task in check_tasks:
203
213
  # only retrieval tasks have different modalities for query and document and can be run with partial overlaps
204
214
  if isinstance(task, AbsTaskRetrieval):
205
215
  query_mods = set(task.metadata.get_modalities(PromptType.query))
@@ -262,7 +272,7 @@ def evaluate(
262
272
  *,
263
273
  co2_tracker: bool | None = None,
264
274
  raise_error: bool = True,
265
- encode_kwargs: dict[str, Any] | None = None,
275
+ encode_kwargs: EncodeKwargs | None = None,
266
276
  cache: ResultCache | None = ResultCache(),
267
277
  overwrite_strategy: str | OverwriteStrategy = "only-missing",
268
278
  prediction_folder: Path | str | None = None,
@@ -332,10 +342,10 @@ def evaluate(
332
342
 
333
343
  # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
334
344
  if isinstance(tasks, AbsTaskAggregate):
335
- task = cast(AbsTaskAggregate, tasks)
345
+ aggregated_task = cast(AbsTaskAggregate, tasks)
336
346
  results = evaluate(
337
347
  model,
338
- task.metadata.tasks,
348
+ aggregated_task.metadata.tasks,
339
349
  co2_tracker=co2_tracker,
340
350
  raise_error=raise_error,
341
351
  encode_kwargs=encode_kwargs,
@@ -345,17 +355,18 @@ def evaluate(
345
355
  show_progress_bar=show_progress_bar,
346
356
  public_only=public_only,
347
357
  )
348
- result = task.combine_task_results(results.task_results)
358
+ combined_results = aggregated_task.combine_task_results(results.task_results)
349
359
  return ModelResult(
350
360
  model_name=results.model_name,
351
361
  model_revision=results.model_revision,
352
- task_results=[result],
362
+ task_results=[combined_results],
353
363
  )
354
364
 
355
365
  if isinstance(tasks, AbsTask):
356
366
  task = tasks
357
367
  else:
358
- results = []
368
+ tasks = cast(Iterable[AbsTask], tasks)
369
+ evaluate_results = []
359
370
  exceptions = []
360
371
  tasks_tqdm = tqdm(
361
372
  tasks,
@@ -376,23 +387,23 @@ def evaluate(
376
387
  show_progress_bar=False,
377
388
  public_only=public_only,
378
389
  )
379
- results.extend(_res.task_results)
390
+ evaluate_results.extend(_res.task_results)
380
391
  if _res.exceptions:
381
392
  exceptions.extend(_res.exceptions)
382
393
  return ModelResult(
383
394
  model_name=_res.model_name,
384
395
  model_revision=_res.model_revision,
385
- task_results=results,
396
+ task_results=evaluate_results,
386
397
  exceptions=exceptions,
387
398
  )
388
399
 
389
400
  overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
390
401
 
391
- existing_results = None
402
+ existing_results: TaskResult | None = None
392
403
  if cache and overwrite_strategy != OverwriteStrategy.ALWAYS:
393
- results = cache.load_task_result(task.metadata.name, meta)
394
- if results:
395
- existing_results = results
404
+ cache_results = cache.load_task_result(task.metadata.name, meta)
405
+ if cache_results:
406
+ existing_results = cache_results
396
407
 
397
408
  if (
398
409
  existing_results
mteb/filter_tasks.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """This script contains functions that are used to get an overview of the MTEB benchmark."""
2
2
 
3
3
  import logging
4
- from collections.abc import Sequence
4
+ from collections.abc import Iterable, Sequence
5
5
  from typing import overload
6
6
 
7
7
  from mteb.abstasks import (
@@ -34,14 +34,14 @@ def _check_is_valid_language(lang: str) -> None:
34
34
 
35
35
  @overload
36
36
  def filter_tasks(
37
- tasks: Sequence[AbsTask],
37
+ tasks: Iterable[AbsTask],
38
38
  *,
39
- languages: list[str] | None = None,
40
- script: list[str] | None = None,
41
- domains: list[TaskDomain] | None = None,
42
- task_types: list[TaskType] | None = None, # type: ignore
43
- categories: list[TaskCategory] | None = None,
44
- modalities: list[Modalities] | None = None,
39
+ languages: Sequence[str] | None = None,
40
+ script: Sequence[str] | None = None,
41
+ domains: Iterable[TaskDomain] | None = None,
42
+ task_types: Iterable[TaskType] | None = None,
43
+ categories: Iterable[TaskCategory] | None = None,
44
+ modalities: Iterable[Modalities] | None = None,
45
45
  exclusive_modality_filter: bool = False,
46
46
  exclude_superseded: bool = False,
47
47
  exclude_aggregate: bool = False,
@@ -51,14 +51,14 @@ def filter_tasks(
51
51
 
52
52
  @overload
53
53
  def filter_tasks(
54
- tasks: Sequence[type[AbsTask]],
54
+ tasks: Iterable[type[AbsTask]],
55
55
  *,
56
- languages: list[str] | None = None,
57
- script: list[str] | None = None,
58
- domains: list[TaskDomain] | None = None,
59
- task_types: list[TaskType] | None = None, # type: ignore
60
- categories: list[TaskCategory] | None = None,
61
- modalities: list[Modalities] | None = None,
56
+ languages: Sequence[str] | None = None,
57
+ script: Sequence[str] | None = None,
58
+ domains: Iterable[TaskDomain] | None = None,
59
+ task_types: Iterable[TaskType] | None = None,
60
+ categories: Iterable[TaskCategory] | None = None,
61
+ modalities: Iterable[Modalities] | None = None,
62
62
  exclusive_modality_filter: bool = False,
63
63
  exclude_superseded: bool = False,
64
64
  exclude_aggregate: bool = False,
@@ -67,14 +67,14 @@ def filter_tasks(
67
67
 
68
68
 
69
69
  def filter_tasks(
70
- tasks: Sequence[AbsTask] | Sequence[type[AbsTask]],
70
+ tasks: Iterable[AbsTask] | Iterable[type[AbsTask]],
71
71
  *,
72
- languages: list[str] | None = None,
73
- script: list[str] | None = None,
74
- domains: list[TaskDomain] | None = None,
75
- task_types: list[TaskType] | None = None, # type: ignore
76
- categories: list[TaskCategory] | None = None,
77
- modalities: list[Modalities] | None = None,
72
+ languages: Sequence[str] | None = None,
73
+ script: Sequence[str] | None = None,
74
+ domains: Iterable[TaskDomain] | None = None,
75
+ task_types: Iterable[TaskType] | None = None,
76
+ categories: Iterable[TaskCategory] | None = None,
77
+ modalities: Iterable[Modalities] | None = None,
78
78
  exclusive_modality_filter: bool = False,
79
79
  exclude_superseded: bool = False,
80
80
  exclude_aggregate: bool = False,
@@ -92,7 +92,6 @@ def filter_tasks(
92
92
  task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included.
93
93
  categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list.
94
94
  exclude_superseded: A boolean flag to exclude datasets which are superseded by another.
95
- eval_splits: A list of evaluation splits to include. If None, all splits are included.
96
95
  modalities: A list of modalities to include. If None, all modalities are included.
97
96
  exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the
98
97
  task's modalities and ALL task modalities are in filter modalities (exact match).
@@ -113,12 +112,12 @@ def filter_tasks(
113
112
  """
114
113
  langs_to_keep = None
115
114
  if languages:
116
- [_check_is_valid_language(lang) for lang in languages]
115
+ [_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value]
117
116
  langs_to_keep = set(languages)
118
117
 
119
118
  script_to_keep = None
120
119
  if script:
121
- [_check_is_valid_script(s) for s in script]
120
+ [_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value]
122
121
  script_to_keep = set(script)
123
122
 
124
123
  domains_to_keep = None
@@ -178,4 +177,4 @@ def filter_tasks(
178
177
 
179
178
  _tasks.append(t)
180
179
 
181
- return _tasks
180
+ return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type
mteb/get_tasks.py CHANGED
@@ -2,8 +2,9 @@
2
2
 
3
3
  import difflib
4
4
  import logging
5
+ import warnings
5
6
  from collections import Counter, defaultdict
6
- from collections.abc import Sequence
7
+ from collections.abc import Iterable, Sequence
7
8
  from typing import Any
8
9
 
9
10
  import pandas as pd
@@ -22,12 +23,11 @@ logger = logging.getLogger(__name__)
22
23
  def _gather_tasks() -> tuple[type[AbsTask], ...]:
23
24
  import mteb.tasks as tasks
24
25
 
25
- tasks = [
26
+ return tuple(
26
27
  t
27
28
  for t in tasks.__dict__.values()
28
29
  if isinstance(t, type) and issubclass(t, AbsTask)
29
- ]
30
- return tuple(tasks)
30
+ )
31
31
 
32
32
 
33
33
  def _create_name_to_task_mapping(
@@ -43,7 +43,7 @@ def _create_name_to_task_mapping(
43
43
  return metadata_names
44
44
 
45
45
 
46
- def _create_similar_tasks(tasks: Sequence[type[AbsTask]]) -> dict[str, list[str]]:
46
+ def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]:
47
47
  """Create a dictionary of similar tasks.
48
48
 
49
49
  Returns:
@@ -194,9 +194,8 @@ class MTEBTasks(tuple[AbsTask]):
194
194
  string with a LaTeX table.
195
195
  """
196
196
  if include_citation_in_name and "name" in properties:
197
- properties += ["intext_citation"]
198
- df = self.to_dataframe(properties)
199
- df["name"] = df["name"] + " " + df["intext_citation"]
197
+ df = self.to_dataframe(tuple(properties) + ("intext_citation",))
198
+ df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator]
200
199
  df = df.drop(columns=["intext_citation"])
201
200
  else:
202
201
  df = self.to_dataframe(properties)
@@ -221,17 +220,17 @@ class MTEBTasks(tuple[AbsTask]):
221
220
 
222
221
 
223
222
  def get_tasks(
224
- tasks: list[str] | None = None,
223
+ tasks: Sequence[str] | None = None,
225
224
  *,
226
- languages: list[str] | None = None,
227
- script: list[str] | None = None,
228
- domains: list[TaskDomain] | None = None,
229
- task_types: list[TaskType] | None = None, # type: ignore
230
- categories: list[TaskCategory] | None = None,
225
+ languages: Sequence[str] | None = None,
226
+ script: Sequence[str] | None = None,
227
+ domains: Sequence[TaskDomain] | None = None,
228
+ task_types: Sequence[TaskType] | None = None,
229
+ categories: Sequence[TaskCategory] | None = None,
231
230
  exclude_superseded: bool = True,
232
- eval_splits: list[str] | None = None,
231
+ eval_splits: Sequence[str] | None = None,
233
232
  exclusive_language_filter: bool = False,
234
- modalities: list[Modalities] | None = None,
233
+ modalities: Sequence[Modalities] | None = None,
235
234
  exclusive_modality_filter: bool = False,
236
235
  exclude_aggregate: bool = False,
237
236
  exclude_private: bool = True,
@@ -287,7 +286,7 @@ def get_tasks(
287
286
  ]
288
287
  return MTEBTasks(_tasks)
289
288
 
290
- _tasks = filter_tasks(
289
+ tasks_: Sequence[type[AbsTask]] = filter_tasks(
291
290
  TASK_LIST,
292
291
  languages=languages,
293
292
  script=script,
@@ -300,12 +299,12 @@ def get_tasks(
300
299
  exclude_aggregate=exclude_aggregate,
301
300
  exclude_private=exclude_private,
302
301
  )
303
- _tasks = [
304
- cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
305
- for cls in _tasks
306
- ]
307
-
308
- return MTEBTasks(_tasks)
302
+ return MTEBTasks(
303
+ [
304
+ cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
305
+ for cls in tasks_
306
+ ]
307
+ )
309
308
 
310
309
 
311
310
  _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
@@ -313,10 +312,10 @@ _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
313
312
 
314
313
  def get_task(
315
314
  task_name: str,
316
- languages: list[str] | None = None,
317
- script: list[str] | None = None,
318
- eval_splits: list[str] | None = None,
319
- hf_subsets: list[str] | None = None,
315
+ languages: Sequence[str] | None = None,
316
+ script: Sequence[str] | None = None,
317
+ eval_splits: Sequence[str] | None = None,
318
+ hf_subsets: Sequence[str] | None = None,
320
319
  exclusive_language_filter: bool = False,
321
320
  ) -> AbsTask:
322
321
  """Get a task by name.
@@ -340,9 +339,9 @@ def get_task(
340
339
  """
341
340
  if task_name in _TASK_RENAMES:
342
341
  _task_name = _TASK_RENAMES[task_name]
343
- logger.warning(
344
- f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
345
- )
342
+ msg = f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
343
+ logger.warning(msg)
344
+ warnings.warn(msg)
346
345
 
347
346
  if task_name not in _TASKS_REGISTRY:
348
347
  close_matches = difflib.get_close_matches(task_name, _TASKS_REGISTRY.keys())
@@ -1,9 +1,9 @@
1
- from collections.abc import Iterable
1
+ from collections.abc import Iterable, Sequence
2
2
  from dataclasses import dataclass
3
3
 
4
4
  from typing_extensions import Self
5
5
 
6
- from mteb.languages import check_language_code
6
+ from mteb.languages.check_language_code import check_language_code
7
7
 
8
8
 
9
9
  @dataclass
@@ -25,7 +25,9 @@ class LanguageScripts:
25
25
 
26
26
  @classmethod
27
27
  def from_languages_and_scripts(
28
- cls, languages: list[str] | None = None, scripts: list[str] | None = None
28
+ cls,
29
+ languages: Sequence[str] | None = None,
30
+ scripts: Sequence[str] | None = None,
29
31
  ) -> Self:
30
32
  """Create a LanguageScripts object from lists of languages and scripts.
31
33