mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/leaderboard/app.py CHANGED
@@ -5,7 +5,7 @@ import tempfile
5
5
  import time
6
6
  import warnings
7
7
  from pathlib import Path
8
- from typing import Literal
8
+ from typing import Literal, get_args
9
9
  from urllib.parse import urlencode
10
10
 
11
11
  import cachetools
@@ -29,40 +29,115 @@ from mteb.leaderboard.table import (
29
29
  apply_summary_styling_from_benchmark,
30
30
  )
31
31
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
32
+ from mteb.models.model_meta import MODEL_TYPES
32
33
 
33
34
  logger = logging.getLogger(__name__)
34
35
 
36
+
35
37
  LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
38
+ MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
36
39
 
37
40
 
38
41
  def _load_results(cache: ResultCache) -> BenchmarkResults:
42
+ """Load benchmark results using an optimized caching strategy.
43
+
44
+ This function implements a two-tier caching strategy for faster leaderboard startup:
45
+
46
+ 1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
47
+ 'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
48
+ This avoids the need to clone the entire results repository and provides
49
+ near-instantaneous loading for most users.
50
+
51
+ 2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
52
+ the original approach of downloading the full results repository and
53
+ building the cache from scratch.
54
+
55
+ The cached results file contains pre-aggregated benchmark data that eliminates
56
+ the need for expensive operations like task selection and revision joining
57
+ during app startup.
58
+
59
+ Args:
60
+ cache: ResultCache instance used for both optimized and fallback operations
61
+
62
+ Returns:
63
+ BenchmarkResults: Complete benchmark results ready for leaderboard display
64
+
65
+ Raises:
66
+ Various exceptions related to network issues, file I/O, or data validation
67
+ are logged and may cause fallback to the slower repository-based approach.
68
+ """
39
69
  start_time = time.time()
40
70
  results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
71
+
41
72
  if not results_cache_path.exists():
42
- logger.info("Cached results not found, downloading from remote...")
43
- cache.download_from_remote()
44
- download_time = time.time() - start_time
45
- logger.info(f"Downloaded remote results in {download_time:.2f}s")
46
-
47
- load_start = time.time()
48
- all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
49
-
50
- all_results = cache.load_results(
51
- models=all_model_names,
52
- only_main_score=True,
53
- require_model_meta=False,
54
- include_remote=True,
73
+ # First try to download the cached results file from the cached-data branch
74
+ # This is faster than cloning the entire results repository
75
+ logger.info(
76
+ "Cached results not found, trying to download from cached-data branch..."
55
77
  )
56
- load_time = time.time() - load_start
57
- logger.info(f"Loaded results from cache in {load_time:.2f}s")
58
- return all_results
59
- else:
60
- logger.info("Loading cached results from disk...")
78
+
79
+ try:
80
+ # Use ResultCache's optimized download method
81
+ # Default saves to mteb/leaderboard/__cached_results.json
82
+ results_cache_path = cache._download_cached_results_from_branch()
83
+ download_time = time.time() - start_time
84
+ logger.info(
85
+ f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
86
+ )
87
+
88
+ except Exception as e:
89
+ logger.error(
90
+ f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
91
+ )
92
+ logger.info("Falling back to downloading full remote repository...")
93
+
94
+ # Fall back to the original approach: clone the full repo
95
+ cache.download_from_remote()
96
+ download_time = time.time() - start_time
97
+ logger.info(f"Downloaded remote results in {download_time:.2f}s")
98
+
99
+ load_start = time.time()
100
+ all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
101
+
102
+ all_results = cache.load_results(
103
+ models=all_model_names,
104
+ only_main_score=True,
105
+ require_model_meta=False,
106
+ include_remote=True,
107
+ )
108
+ load_time = time.time() - load_start
109
+ logger.info(f"Loaded results from cache in {load_time:.2f}s")
110
+ return all_results
111
+
112
+ # Load the cached results file (either pre-existing or just downloaded)
113
+ logger.info("Loading cached results from disk...")
114
+ try:
115
+ logger.info(f"Opening file: {results_cache_path}")
116
+
117
+ file_size = results_cache_path.stat().st_size
118
+ logger.info(f"File exists, size: {file_size} bytes")
119
+
61
120
  with results_cache_path.open() as cache_file:
62
- results = mteb.BenchmarkResults.from_validated(**json.load(cache_file))
63
- total_time = time.time() - start_time
64
- logger.info(f"Loaded cached results in {total_time:.2f}s")
65
- return results
121
+ logger.info("File opened successfully, attempting JSON parse...")
122
+ json_data = json.load(cache_file)
123
+ logger.info(
124
+ f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
125
+ )
126
+
127
+ logger.info("Attempting BenchmarkResults.from_validated...")
128
+ results = mteb.BenchmarkResults.from_validated(**json_data)
129
+ logger.info("BenchmarkResults.from_validated successful")
130
+
131
+ except Exception as e:
132
+ # TODO: Handle the case when we fail to load cached results from disk.
133
+ logger.error(
134
+ f"Failed to load cached results from disk: {type(e).__name__}: {e}"
135
+ )
136
+ raise
137
+
138
+ total_time = time.time() - start_time
139
+ logger.info(f"Loaded cached results in {total_time:.2f}s")
140
+ return results
66
141
 
67
142
 
68
143
  def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
@@ -169,7 +244,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
169
244
  df = df.drop(columns="reference")
170
245
  return gr.DataFrame(
171
246
  df,
172
- datatype=["markdown"] + ["str"] * (len(df.columns) - 1), # type: ignore
247
+ datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
173
248
  buttons=["copy", "fullscreen"],
174
249
  show_search="filter",
175
250
  )
@@ -187,6 +262,7 @@ def _filter_models(
187
262
  instructions: bool | None,
188
263
  max_model_size: int,
189
264
  zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
265
+ model_types: list[str] | None,
190
266
  ):
191
267
  lower, upper = 0, max_model_size
192
268
  # Setting to None, when the user doesn't specify anything
@@ -205,6 +281,7 @@ def _filter_models(
205
281
  use_instructions=instructions,
206
282
  frameworks=compatibility,
207
283
  n_parameters_range=(lower, upper),
284
+ model_types=model_types,
208
285
  )
209
286
 
210
287
  models_to_keep = set()
@@ -269,6 +346,7 @@ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
269
346
  instructions=None,
270
347
  max_model_size=MAX_MODEL_SIZE,
271
348
  zero_shot_setting="allow_all",
349
+ model_types=MODEL_TYPE_CHOICES,
272
350
  )
273
351
  # Sort to ensure consistency with update_models
274
352
  initial_models = sorted(initial_models)
@@ -387,6 +465,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
387
465
  instructions=None,
388
466
  max_model_size=MAX_MODEL_SIZE,
389
467
  zero_shot_setting="allow_all",
468
+ model_types=MODEL_TYPE_CHOICES,
390
469
  )
391
470
  default_filtered_scores = [
392
471
  entry for entry in default_scores if entry["model_name"] in filtered_models
@@ -583,6 +662,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
583
662
  label="Model Parameters",
584
663
  interactive=True,
585
664
  )
665
+ with gr.Column():
666
+ model_type_select = gr.CheckboxGroup(
667
+ MODEL_TYPE_CHOICES,
668
+ value=MODEL_TYPE_CHOICES,
669
+ label="Model Type",
670
+ )
586
671
 
587
672
  with gr.Tab("Summary"):
588
673
  summary_table.render()
@@ -755,7 +840,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
755
840
  compatibility,
756
841
  instructions,
757
842
  max_model_size,
758
- zero_shot: hash(
843
+ zero_shot,
844
+ model_type_select: hash(
759
845
  (
760
846
  id(scores),
761
847
  hash(tuple(tasks)),
@@ -764,6 +850,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
764
850
  hash(instructions),
765
851
  hash(max_model_size),
766
852
  hash(zero_shot),
853
+ hash(tuple(model_type_select)),
767
854
  )
768
855
  ),
769
856
  )
@@ -775,6 +862,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
775
862
  instructions: bool | None,
776
863
  max_model_size: int,
777
864
  zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
865
+ model_type_select: list[str],
778
866
  ):
779
867
  start_time = time.time()
780
868
  model_names = list({entry["model_name"] for entry in scores})
@@ -786,6 +874,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
786
874
  instructions,
787
875
  max_model_size,
788
876
  zero_shot_setting=zero_shot,
877
+ model_types=model_type_select,
789
878
  )
790
879
  elapsed = time.time() - start_time
791
880
  logger.debug(f"update_models callback: {elapsed}s")
@@ -803,6 +892,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
803
892
  instructions,
804
893
  max_model_size,
805
894
  zero_shot,
895
+ model_type_select,
806
896
  ],
807
897
  outputs=[models],
808
898
  )
@@ -817,6 +907,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
817
907
  instructions,
818
908
  max_model_size,
819
909
  zero_shot,
910
+ model_type_select,
820
911
  ],
821
912
  outputs=[models],
822
913
  )
@@ -830,6 +921,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
830
921
  instructions,
831
922
  max_model_size,
832
923
  zero_shot,
924
+ model_type_select,
833
925
  ],
834
926
  outputs=[models],
835
927
  )
@@ -843,6 +935,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
843
935
  instructions,
844
936
  max_model_size,
845
937
  zero_shot,
938
+ model_type_select,
846
939
  ],
847
940
  outputs=[models],
848
941
  )
@@ -856,6 +949,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
856
949
  instructions,
857
950
  max_model_size,
858
951
  zero_shot,
952
+ model_type_select,
859
953
  ],
860
954
  outputs=[models],
861
955
  )
@@ -869,6 +963,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
869
963
  instructions,
870
964
  max_model_size,
871
965
  zero_shot,
966
+ model_type_select,
872
967
  ],
873
968
  outputs=[models],
874
969
  )
@@ -882,6 +977,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
882
977
  instructions,
883
978
  max_model_size,
884
979
  zero_shot,
980
+ model_type_select,
981
+ ],
982
+ outputs=[models],
983
+ )
984
+ model_type_select.change(
985
+ update_models,
986
+ inputs=[
987
+ scores,
988
+ task_select,
989
+ availability,
990
+ compatibility,
991
+ instructions,
992
+ max_model_size,
993
+ zero_shot,
994
+ model_type_select,
885
995
  ],
886
996
  outputs=[models],
887
997
  )
@@ -1023,16 +1133,34 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
1023
1133
 
1024
1134
 
1025
1135
  if __name__ == "__main__":
1026
- logging.getLogger("mteb.load_results.task_results").setLevel(
1027
- logging.ERROR
1028
- ) # Warnings related to task split
1029
- logging.getLogger("mteb.model_meta").setLevel(
1030
- logging.ERROR
1031
- ) # Warning related to model metadata (fetch_from_hf=False)
1032
- logging.getLogger("mteb.load_results.benchmark_results").setLevel(
1033
- logging.ERROR
1034
- ) # Warning related to model metadata (fetch_from_hf=False)
1136
+ import os
1137
+
1138
+ # Add process ID to logging for multiprocessing debugging
1139
+ logging.basicConfig(
1140
+ level=logging.INFO,
1141
+ format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
1142
+ force=True, # Override any existing handlers
1143
+ )
1144
+
1145
+ # Flush log handlers immediately (helpful for multiprocessing)
1146
+ for handler in logging.root.handlers:
1147
+ handler.flush()
1148
+
1149
+ logger.info(f"Starting leaderboard app in process {os.getpid()}")
1150
+
1151
+ # Suppress specific WARNING messages while keeping INFO level for the app
1152
+ logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
1153
+ logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
1154
+ logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
1155
+
1035
1156
  warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
1157
+ warnings.filterwarnings("ignore", message="Could not get source model: .*")
1158
+ warnings.filterwarnings(
1159
+ "ignore", message="No scores data available. Returning empty DataFrame."
1160
+ )
1161
+ warnings.filterwarnings("ignore", message="Main score .* not found in scores")
1162
+ warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
1163
+ warnings.filterwarnings("ignore", message=".*: Missing splits .*")
1036
1164
 
1037
1165
  app = get_leaderboard_app()
1038
1166
 
mteb/load_results.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import sys
4
- from collections.abc import Sequence
4
+ from collections.abc import Iterable, Sequence
5
5
  from pathlib import Path
6
6
 
7
7
  from mteb.abstasks.abstask import AbsTask
@@ -45,8 +45,8 @@ def _model_name_and_revision(
45
45
  def load_results(
46
46
  results_repo: str = "https://github.com/embeddings-benchmark/results",
47
47
  download_latest: bool = True,
48
- models: Sequence[ModelMeta] | Sequence[str] | None = None,
49
- tasks: Sequence[AbsTask] | Sequence[str] | None = None,
48
+ models: Iterable[ModelMeta] | Sequence[str] | None = None,
49
+ tasks: Iterable[AbsTask] | Sequence[str] | None = None,
50
50
  validate_and_filter: bool = True,
51
51
  require_model_meta: bool = True,
52
52
  only_main_score: bool = False,
@@ -83,21 +83,21 @@ def load_results(
83
83
 
84
84
  if models is not None:
85
85
  models_to_keep = {}
86
- for model_path in models:
87
- if isinstance(model_path, ModelMeta):
88
- models_to_keep[model_path.name] = model_path.revision
86
+ for model in models:
87
+ if isinstance(model, ModelMeta):
88
+ models_to_keep[model.name] = model.revision
89
89
  else:
90
- models_to_keep[model_path] = None
90
+ models_to_keep[model] = None
91
91
  else:
92
92
  models_to_keep = None
93
93
 
94
- task_names = {}
94
+ task_names: dict[str, AbsTask | None] = {}
95
95
  if tasks is not None:
96
- for task in tasks:
97
- if isinstance(task, AbsTask):
98
- task_names[task.metadata.name] = task
96
+ for task_ in tasks:
97
+ if isinstance(task_, AbsTask):
98
+ task_names[task_.metadata.name] = task_
99
99
  else:
100
- task_names[task] = None
100
+ task_names[task_] = None
101
101
 
102
102
  model_results = []
103
103
  for model_path in model_paths:
@@ -1,9 +1,11 @@
1
1
  import logging
2
+ import warnings
2
3
  from abc import ABC, abstractmethod
3
4
  from collections.abc import Callable, Sequence
4
5
  from typing import Any, Literal, cast, get_args, overload
5
6
 
6
7
  from torch.utils.data import DataLoader
8
+ from typing_extensions import Unpack
7
9
 
8
10
  import mteb
9
11
  from mteb.abstasks.task_metadata import TaskMetadata, TaskType
@@ -18,6 +20,7 @@ from mteb.similarity_functions import (
18
20
  from mteb.types import (
19
21
  Array,
20
22
  BatchedInput,
23
+ EncodeKwargs,
21
24
  PromptType,
22
25
  )
23
26
 
@@ -43,7 +46,7 @@ class AbsEncoder(ABC):
43
46
  model: Any
44
47
  mteb_model_meta: ModelMeta | None = None
45
48
  model_prompts: dict[str, str] | None = None
46
- instruction_template: str | Callable[[str, PromptType], str] | None = None
49
+ instruction_template: str | Callable[[str, PromptType | None], str] | None = None
47
50
  prompts_dict: dict[str, str] | None = None
48
51
 
49
52
  def get_prompt_name(
@@ -110,7 +113,7 @@ class AbsEncoder(ABC):
110
113
  if not self.model_prompts:
111
114
  return None
112
115
  prompt_name = self.get_prompt_name(task_metadata, prompt_type)
113
- return self.model_prompts.get(prompt_name)
116
+ return self.model_prompts.get(prompt_name) if prompt_name else None
114
117
 
115
118
  @staticmethod
116
119
  @overload
@@ -187,6 +190,7 @@ class AbsEncoder(ABC):
187
190
  except KeyError:
188
191
  msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
189
192
  logger.warning(msg)
193
+ warnings.warn(msg)
190
194
  invalid_task_messages.add(msg)
191
195
  invalid_keys.add(task_key)
192
196
 
@@ -232,9 +236,9 @@ class AbsEncoder(ABC):
232
236
  if isinstance(prompt, dict) and prompt_type:
233
237
  if prompt.get(prompt_type.value):
234
238
  return prompt[prompt_type.value]
235
- logger.warning(
236
- f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
237
- )
239
+ msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
240
+ logger.warning(msg)
241
+ warnings.warn(msg)
238
242
  return ""
239
243
 
240
244
  if prompt:
@@ -368,7 +372,7 @@ class AbsEncoder(ABC):
368
372
  hf_split: str,
369
373
  hf_subset: str,
370
374
  prompt_type: PromptType | None = None,
371
- **kwargs: Any,
375
+ **kwargs: Unpack[EncodeKwargs],
372
376
  ) -> Array:
373
377
  """Encodes the given sentences using the encoder.
374
378
 
@@ -5,8 +5,6 @@ from typing import Any, Protocol, runtime_checkable
5
5
 
6
6
  import numpy as np
7
7
 
8
- from mteb.types import BatchedInput
9
-
10
8
 
11
9
  @runtime_checkable
12
10
  class CacheBackendProtocol(Protocol):
@@ -26,7 +24,7 @@ class CacheBackendProtocol(Protocol):
26
24
  **kwargs: Additional backend-specific arguments.
27
25
  """
28
26
 
29
- def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None:
27
+ def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
30
28
  """Add a vector to the cache.
31
29
 
32
30
  Args:
@@ -34,7 +32,7 @@ class CacheBackendProtocol(Protocol):
34
32
  vectors: Embedding vector of shape (dim,) or (1, dim).
35
33
  """
36
34
 
37
- def get_vector(self, item: BatchedInput) -> np.ndarray | None:
35
+ def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
38
36
  """Retrieve the cached vector for the given item.
39
37
 
40
38
  Args:
@@ -53,5 +51,5 @@ class CacheBackendProtocol(Protocol):
53
51
  def close(self) -> None:
54
52
  """Release resources or flush data."""
55
53
 
56
- def __contains__(self, item: BatchedInput) -> bool:
54
+ def __contains__(self, item: dict[str, Any]) -> bool:
57
55
  """Check whether the cache contains an item."""
@@ -1,12 +1,13 @@
1
1
  import hashlib
2
+ from collections.abc import Mapping
3
+ from typing import Any
2
4
 
3
- from mteb.types import BatchedInput
4
5
 
5
-
6
- def _hash_item(item: BatchedInput) -> str:
6
+ def _hash_item(item: Mapping[str, Any]) -> str:
7
7
  item_hash = ""
8
8
  if "text" in item:
9
- item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
9
+ item_text: str = item["text"]
10
+ item_hash = hashlib.sha256(item_text.encode()).hexdigest()
10
11
 
11
12
  if "image" in item:
12
13
  from PIL import Image
@@ -1,6 +1,8 @@
1
1
  import json
2
2
  import logging
3
+ import warnings
3
4
  from pathlib import Path
5
+ from typing import Any
4
6
 
5
7
  import numpy as np
6
8
 
@@ -36,7 +38,7 @@ class FaissCache:
36
38
  logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
37
39
  self.load()
38
40
 
39
- def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None:
41
+ def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
40
42
  """Add vector to FAISS index."""
41
43
  import faiss
42
44
 
@@ -71,7 +73,9 @@ class FaissCache:
71
73
  try:
72
74
  return self.index.reconstruct(idx)
73
75
  except Exception:
74
- logger.warning(f"Vector id {idx} missing for hash {item_hash}")
76
+ msg = f"Vector id {idx} missing for hash {item_hash}"
77
+ logger.warning(msg)
78
+ warnings.warn(msg)
75
79
  return None
76
80
 
77
81
  def save(self) -> None: