mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/cache.py CHANGED
@@ -1,13 +1,19 @@
+ import gzip
+ import io
  import json
  import logging
  import os
  import shutil
  import subprocess
+ import warnings
  from collections import defaultdict
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence
  from pathlib import Path
  from typing import cast

+ import requests
+ from pydantic import ValidationError
+
  import mteb
  from mteb.abstasks import AbsTask
  from mteb.benchmarks.benchmark import Benchmark
@@ -22,8 +28,8 @@ class ResultCache:
      """Class to handle the local cache of MTEB results.

      Examples:
-         >>> from mteb.cache import ResultCache
-         >>> cache = ResultCache(cache_path="~/.cache/mteb") # default
+         >>> import mteb
+         >>> cache = mteb.ResultCache(cache_path="~/.cache/mteb") # default
          >>> cache.download_from_remote() # download the latest results from the remote repository
          >>> result = cache.load_results("task_name", "model_name")
      """
@@ -83,9 +89,9 @@ class ResultCache:
          model_path = results_folder / model_name

          if model_revision is None:
-             logger.warning(
-                 "model_revision is not specified, attempting to load the latest revision. To disable this behavior, specify model_revision explicitly."
-             )
+             msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
+             logger.warning(msg)
+             warnings.warn(msg)
              # get revs from paths
              revisions = [p for p in model_path.glob("*") if p.is_dir()]
              if not revisions:
@@ -275,21 +281,165 @@ class ResultCache:

          return results_directory

+     def _download_cached_results_from_branch(
+         self,
+         branch: str = "cached-data",
+         filename: str = "__cached_results.json.gz",
+         output_path: Path | None = None,
+         remote: str = "https://github.com/embeddings-benchmark/results",
+         timeout: int = 60,
+         max_size_mb: int = 500,
+     ) -> Path:
+         """Download pre-computed cached results from a specific branch.
+
+         This is significantly faster than download_from_remote() since it downloads
+         only a compressed cache file instead of cloning the entire repository.
+
+         The method performs the following steps:
+         1. Downloads a gzipped JSON file from the specified branch
+         2. Validates file size and content type
+         3. Decompresses the gzip content
+         4. Writes the decompressed JSON to disk
+
+         Args:
+             branch: Branch name to download from (default: "cached-data")
+             filename: Name of the cached results file (default: "__cached_results.json.gz")
+             output_path: Where to save the file. If None, uses mteb/leaderboard/__cached_results.json
+             remote: Base URL of the results repository
+             timeout: Request timeout in seconds (default: 60)
+             max_size_mb: Maximum allowed file size in megabytes (default: 500)
+
+         Returns:
+             Path to the downloaded and decompressed cache file
+
+         Raises:
+             requests.exceptions.RequestException: On HTTP errors
+             ValueError: On validation failures (size, content-type)
+             gzip.BadGzipFile: If content is not valid gzip
+             UnicodeDecodeError: If content cannot be decoded as UTF-8
+             PermissionError: If file cannot be written due to permissions
+             OSError: On other file system errors
+
+         Examples:
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
+             >>> # Download optimized cached results
+             >>> cache_file = cache._download_cached_results_from_branch()
+             >>> # Use custom output path
+             >>> cache_file = cache._download_cached_results_from_branch(
+             ...     output_path=Path("/tmp/my_cache.json")
+             ... )
+         """
+         if output_path is None:
+             # Default to saving in mteb/leaderboard/__cached_results.json
+             # Get the mteb package directory (parent of this file)
+             mteb_package_dir = Path(__file__).parent
+             output_path = mteb_package_dir / "leaderboard" / "__cached_results.json"
+
+         # Extract repository owner and name from the remote URL
+         # e.g., "https://github.com/embeddings-benchmark/results" -> "embeddings-benchmark/results"
+         repo_path = remote.replace("https://github.com/", "").replace(
+             "http://github.com/", ""
+         )
+
+         url = f"https://raw.githubusercontent.com/{repo_path}/{branch}/{filename}"
+         logger.info(f"Downloading cached results from {url}")
+
+         # Step 1: Download with validation
+         max_size_bytes = max_size_mb * 1024 * 1024
+
+         try:
+             response = requests.get(url, timeout=timeout)
+             response.raise_for_status()
+
+             # Check if this is a Git LFS pointer file
+             content_type = response.headers.get("content-type", "").lower()
+             if (
+                 content_type == "text/plain; charset=utf-8"
+                 and b"git-lfs" in response.content
+             ):
+                 # Try Git LFS media URL instead
+                 media_url = f"https://media.githubusercontent.com/media/{repo_path}/{branch}/{filename}"
+                 logger.info(f"Detected Git LFS file, trying media URL: {media_url}")
+                 response = requests.get(media_url, timeout=timeout)
+                 response.raise_for_status()
+                 content_type = response.headers.get("content-type", "").lower()
+
+             # Validate content-type header
+             expected_content_types = [
+                 "application/gzip",
+                 "application/octet-stream",
+                 "application/x-gzip",
+             ]
+             if content_type and not any(
+                 ct in content_type for ct in expected_content_types
+             ):
+                 raise Exception(
+                     f"Unexpected content-type: {content_type}. Expected one of: {expected_content_types}"
+                 )
+
+             # Validate file size
+             content_length = len(response.content)
+             if content_length > max_size_bytes:
+                 raise ValueError(
+                     f"Downloaded file too large: {content_length} bytes (max: {max_size_bytes})"
+                 )
+
+             logger.info(
+                 f"HTTP request successful, content length: {content_length} bytes"
+             )
+             content = response.content
+
+         except Exception as e:
+             logger.error(f"Unexpected HTTP error: {type(e).__name__}: {e}")
+             raise e
+
+         # Step 2: Decompress gzip data
+         logger.info("Attempting gzip decompression...")
+
+         try:
+             with gzip.open(io.BytesIO(content), "rt", encoding="utf-8") as gz_file:
+                 data = gz_file.read()
+             logger.info(f"Decompression successful, data length: {len(data)} chars")
+
+         except Exception as e:
+             logger.error(f"Unexpected decompression error: {type(e).__name__}: {e}")
+             raise e
+
+         # Step 3: Write to disk
+         logger.info(f"Attempting to write to: {output_path}")
+
+         # Check parent directory exists and is writable
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         try:
+             output_path.write_text(data, encoding="utf-8")
+             logger.info(
+                 f"File write successful, size: {output_path.stat().st_size} bytes"
+             )
+         except Exception as e:
+             logger.error(f"Unexpected file write error: {type(e).__name__}: {e}")
+             raise e
+
+         return output_path
+
      def clear_cache(self) -> None:
          """Clear the local cache directory."""
          if self.cache_path.exists() and self.cache_path.is_dir():
              shutil.rmtree(self.cache_path)
              logger.info(f"Cache directory {self.cache_path} cleared.")
          else:
-             logger.warning(f"Cache directory {self.cache_path} does not exist.")
+             msg = f"Cache directory `{self.cache_path}` does not exist."
+             logger.warning(msg)
+             warnings.warn(msg)

      def __repr__(self) -> str:
          return f"ResultCache(cache_path={self.cache_path})"

      def get_cache_paths(
          self,
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
-         tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | None = None,
          require_model_meta: bool = True,
          include_remote: bool = True,
      ) -> list[Path]:
@@ -311,8 +461,8 @@ class ResultCache:
              A list of paths in the cache directory.

          Examples:
-             >>> from mteb.cache import ResultCache
-             >>> cache = ResultCache()
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
              >>>
              >>> # Get all cache paths
              >>> paths = cache.get_cache_paths()
@@ -422,7 +572,7 @@ class ResultCache:
      @staticmethod
      def _filter_paths_by_model_and_revision(
          paths: list[Path],
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
      ) -> list[Path]:
          """Filter a list of paths by model name and optional revision.

@@ -432,8 +582,9 @@ class ResultCache:
          if not models:
              return paths

-         if isinstance(models[0], ModelMeta):
-             models = cast(list[ModelMeta], models)
+         first_model = next(iter(models))
+         if isinstance(first_model, ModelMeta):
+             models = cast(Iterable[ModelMeta], models)
              name_and_revision = {
                  (m.model_name_as_path(), m.revision or "no_revision_available")
                  for m in models
@@ -444,13 +595,14 @@ class ResultCache:
                  if (p.parent.parent.name, p.parent.name) in name_and_revision
              ]

-         model_names = {m.replace("/", "__").replace(" ", "_") for m in models}
+         str_models = cast(Sequence[str], models)
+         model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
          return [p for p in paths if p.parent.parent.name in model_names]

      @staticmethod
      def _filter_paths_by_task(
          paths: list[Path],
-         tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | None = None,
      ) -> list[Path]:
          if tasks is not None:
              task_names = set()
@@ -466,8 +618,8 @@ class ResultCache:

      def load_results(
          self,
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
-         tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None,
          require_model_meta: bool = True,
          include_remote: bool = True,
          validate_and_filter: bool = False,
@@ -478,6 +630,7 @@ class ResultCache:
          Args:
              models: A list of model names to load the results for. If None it will load the results for all models.
              tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+                 If Benchmark is passed, then all tasks in the benchmark will be loaded.
                  If None it will load the results for all tasks.
              require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                  extract the model name and revision from the path.
@@ -490,8 +643,8 @@ class ResultCache:
              A BenchmarkResults object containing the results for the specified models and tasks.

          Examples:
-             >>> from mteb.cache import ResultCache
-             >>> cache = ResultCache()
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
              >>>
              >>> # Load results for specific models and tasks
              >>> results = cache.load_results(
@@ -511,7 +664,7 @@ class ResultCache:
          )
          models_results = defaultdict(list)

-         task_names = {}
+         task_names: dict[str, AbsTask | None] = {}
          if tasks is not None:
              for task in tasks:
                  if isinstance(task, AbsTask):
@@ -529,10 +682,12 @@ class ResultCache:
              )

              if validate_and_filter:
-                 task = task_names[task_result.task_name]
+                 task_instance = task_names[task_result.task_name]
                  try:
-                     task_result = task_result.validate_and_filter_scores(task=task)
-                 except Exception as e:
+                     task_result = task_result.validate_and_filter_scores(
+                         task=task_instance
+                     )
+                 except ValidationError as e:
                      logger.info(
                          f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
                      )
@@ -541,7 +696,7 @@ class ResultCache:
              models_results[(model_name, revision)].append(task_result)

          # create BenchmarkResults object
-         models_results = [
+         models_results_object = [
              ModelResult(
                  model_name=model_name,
                  model_revision=revision,
@@ -550,9 +705,7 @@ class ResultCache:
              for (model_name, revision), task_results in models_results.items()
          ]

-         benchmark_results = BenchmarkResults(
-             model_results=models_results,
+         return BenchmarkResults(
+             model_results=models_results_object,
              benchmark=tasks if isinstance(tasks, Benchmark) else None,
          )
-
-         return benchmark_results
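Taken together, the new `_download_cached_results_from_branch` helper and the widened `load_results` signature let a caller pull the pre-built leaderboard cache and then query results per benchmark. The snippet below is an illustrative sketch based only on the diff above, not something shipped with the package; the model name, benchmark name, and output path are placeholders, and the helper remains a private method.

    # Illustrative sketch (placeholders throughout, not part of mteb itself).
    from pathlib import Path

    import mteb

    cache = mteb.ResultCache()  # defaults to ~/.cache/mteb

    # Fetch the pre-computed, gzipped results cache instead of cloning the results repo.
    cache_file = cache._download_cached_results_from_branch(
        output_path=Path("/tmp/mteb_cached_results.json")
    )
    print(f"Cached results written to {cache_file}")

    # load_results now also accepts a Benchmark object directly.
    benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # placeholder benchmark name
    results = cache.load_results(models=["intfloat/e5-small"], tasks=benchmark)
    print(results)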
mteb/cli/_display_tasks.py CHANGED
@@ -1,4 +1,4 @@
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence

  from mteb.abstasks import AbsTask
  from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
      _display_tasks(benchmark.tasks, name=name)


- def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None:
+ def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
      from rich.console import Console

      console = Console()
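The Sequence → Iterable loosening above means `_display_tasks` no longer needs a materialised list; any iterable of tasks, including a generator, can be displayed. A minimal sketch under that assumption (the filter shown is only an example):

    # Illustrative only: with Iterable[AbsTask], a generator expression can be passed directly.
    import mteb
    from mteb.cli._display_tasks import _display_tasks

    tasks = mteb.get_tasks(languages=["eng"])
    retrieval_only = (t for t in tasks if t.metadata.type == "Retrieval")
    _display_tasks(retrieval_only, name="English retrieval tasks")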
mteb/cli/build_cli.py CHANGED
@@ -1,17 +1,19 @@
  import argparse
  import logging
  import os
+ import warnings
  from pathlib import Path

  import torch
  from rich.logging import RichHandler

  import mteb
+ from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
+ from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy
-
- from ._display_tasks import _display_benchmarks, _display_tasks
+ from mteb.types._encoder_io import EncodeKwargs

  logger = logging.getLogger(__name__)

@@ -53,7 +55,7 @@ def run(args: argparse.Namespace) -> None:

      if args.benchmarks:
          benchmarks = mteb.get_benchmarks(names=args.benchmarks)
-         tasks = [t for b in benchmarks for t in b.tasks]
+         tasks = tuple(t for b in benchmarks for t in b.tasks)
      else:
          tasks = mteb.get_tasks(
              categories=args.categories,
@@ -63,21 +65,23 @@ def run(args: argparse.Namespace) -> None:
              eval_splits=args.eval_splits,
          )

-     encode_kwargs = {}
+     encode_kwargs: EncodeKwargs = {}
      if args.batch_size is not None:
          encode_kwargs["batch_size"] = args.batch_size

      overwrite_strategy = args.overwrite_strategy
      if args.overwrite:
-         logger.warning(
-             "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead."
+         warnings.warn(
+             "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead.",
+             DeprecationWarning,
          )
          overwrite_strategy = OverwriteStrategy.ALWAYS.value

      prediction_folder = args.prediction_folder
      if args.save_predictions:
-         logger.warning(
-             "`--save_predictions` is deprecated, please use `--prediction-folder` instead."
+         warnings.warn(
+             "`--save_predictions` is deprecated, please use `--prediction-folder` instead.",
+             DeprecationWarning,
          )
          prediction_folder = args.output_folder

@@ -279,23 +283,25 @@ def _create_meta(args: argparse.Namespace) -> None:
          from_existing = Path(from_existing)

      if output_path.exists() and overwrite:
-         logger.warning("Output path already exists, overwriting.")
+         msg = "Output path already exists, overwriting."
+         logger.warning(msg)
+         warnings.warn(msg)
      elif output_path.exists():
          raise FileExistsError(
              "Output path already exists, use --overwrite to overwrite."
          )

-     tasks = []
+     benchmarks = None
+     tasks: list[AbsTask] = []
      if tasks_names is not None:
-         tasks = mteb.get_tasks(tasks_names)
+         tasks = list(mteb.get_tasks(tasks_names))
      if benchmarks is not None:
          benchmarks = mteb.get_benchmarks(benchmarks)
-         for benchmark in benchmarks:
-             tasks.extend(benchmark.tasks)

      generate_model_card(
          model_name,
-         tasks if len(tasks) > 0 else None,
+         tasks,
+         benchmarks,
          existing_model_card_id_or_path=from_existing,
          results_cache=ResultCache(results_folder),
          output_path=output_path,
@@ -356,6 +362,95 @@ def _add_create_meta_parser(subparsers) -> None:
      parser.set_defaults(func=_create_meta)


+ def _add_leaderboard_parser(subparsers) -> None:
+     parser = subparsers.add_parser("leaderboard", help="Launch the MTEB leaderboard")
+
+     parser.add_argument(
+         "--cache-path",
+         type=str,
+         help="Path to the cache folder containing model results",
+         required=False,
+         default=None,
+     )
+     parser.add_argument(
+         "--host",
+         type=str,
+         default="0.0.0.0",
+         help="Host to run the leaderboard server on",
+     )
+     parser.add_argument(
+         "--port",
+         type=int,
+         default=7860,
+         help="Port to run the leaderboard server on",
+     )
+     parser.add_argument(
+         "--share",
+         action="store_true",
+         default=False,
+         help="Create a public URL for the leaderboard",
+     )
+
+     parser.set_defaults(func=_leaderboard)
+
+
+ def _leaderboard(args: argparse.Namespace) -> None:
+     """Launch the MTEB leaderboard with specified cache path."""
+     # Import leaderboard module only when needed to avoid requiring leaderboard dependencies
+     # for other CLI commands
+     try:
+         import gradio as gr
+
+         from mteb.leaderboard import get_leaderboard_app
+     except ImportError as e:
+         raise ImportError(
+             "Seems like some dependencies are not installed. "
+             + "You can likely install these using: `pip install mteb[leaderboard]`. "
+             + f"{e}"
+         )
+
+     cache_path = args.cache_path
+
+     if cache_path:
+         logger.info(f"Using cache path: {cache_path}")
+         cache = ResultCache(cache_path)
+     else:
+         cache = ResultCache()
+         logger.info(f"Using default cache path: {cache.cache_path}")
+
+     app = get_leaderboard_app(cache)
+
+     logger.info(f"Starting leaderboard on {args.host}:{args.port}")
+     if args.share:
+         logger.info("Creating public URL...")
+
+     logging.getLogger("mteb.load_results.task_results").setLevel(
+         logging.ERROR
+     )  # Warnings related to task split
+     logging.getLogger("mteb.model_meta").setLevel(
+         logging.ERROR
+     )  # Warning related to model metadata (fetch_from_hf=False)
+     logging.getLogger("mteb.load_results.benchmark_results").setLevel(
+         logging.ERROR
+     )  # Warning related to model metadata (fetch_from_hf=False)
+     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
+
+     # Head content for Tailwind CSS
+     head = """
+     <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+     """
+
+     app.launch(
+         server_name=args.host,
+         server_port=args.port,
+         share=args.share,
+         theme=gr.themes.Soft(
+             font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+         ),
+         head=head,
+     )
+
+
  def build_cli() -> argparse.ArgumentParser:
      """Builds the argument parser for the MTEB CLI.

@@ -375,6 +470,7 @@ def build_cli() -> argparse.ArgumentParser:
      _add_available_tasks_parser(subparsers)
      _add_available_benchmarks_parser(subparsers)
      _add_create_meta_parser(subparsers)
+     _add_leaderboard_parser(subparsers)

      return parser

mteb/cli/generate_model_card.py CHANGED
@@ -1,10 +1,12 @@
  import logging
+ import warnings
+ from collections.abc import Sequence
  from pathlib import Path

  from huggingface_hub import ModelCard, ModelCardData, repo_exists

- from mteb import BenchmarkResults
  from mteb.abstasks.abstask import AbsTask
+ from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache

  logger = logging.getLogger(__name__)
@@ -12,12 +14,13 @@ logger = logging.getLogger(__name__)

  def generate_model_card(
      model_name: str,
-     tasks: list[AbsTask] | None = None,
+     tasks: Sequence[AbsTask] | None = None,
+     benchmarks: Sequence[Benchmark] | None = None,
      existing_model_card_id_or_path: str | Path | None = None,
      results_cache: ResultCache = ResultCache(),
      output_path: Path = Path("model_card.md"),
      add_table_to_model_card: bool = False,
-     models_to_compare: list[str] | None = None,
+     models_to_compare: Sequence[str] | None = None,
      token: str | None = None,
      push_to_hub: bool = False,
  ) -> None:
@@ -26,6 +29,7 @@ def generate_model_card(
      Args:
          model_name: Name of the model.
          tasks: List of tasks to generate results for.
+         benchmarks: A Benchmark or list of benchmarks to generate results for.
          existing_model_card_id_or_path: Path or ID of an existing model card to update.
          results_cache: Instance of ResultCache to load results from.
          output_path: Path to save the generated model card.
@@ -39,16 +43,24 @@ def generate_model_card(
      if existing_model_card_id_or_path:
          existing_model_card = ModelCard.load(existing_model_card_id_or_path)

+     all_tasks: list[AbsTask] = []
+     if tasks is not None:
+         all_tasks.extend(tasks)
+
+     if benchmarks is not None:
+         for b in benchmarks:
+             all_tasks.extend(b.tasks)
+
      benchmark_results = results_cache.load_results(
-         [model_name], tasks, only_main_score=True
+         [model_name], all_tasks if all_tasks else None, only_main_score=True
      )
      eval_results = []
      for models_results in benchmark_results.model_results:
          for task_result in models_results.task_results:
              eval_results.extend(task_result.get_hf_eval_results())

-     existing_model_card_data = (
-         existing_model_card.data if existing_model_card else ModelCardData()
+     existing_model_card_data: ModelCardData = (
+         existing_model_card.data if existing_model_card else ModelCardData()  # type: ignore[assignment]
      )

      if existing_model_card_data.eval_results is None:
@@ -78,35 +90,43 @@ def generate_model_card(
          card_data=existing_model_card_data
      )

-     if models_to_compare:
-         benchmark_results = results_cache.load_results(
-             [model_name, *models_to_compare], tasks, only_main_score=True
-         )
-
      if add_table_to_model_card:
          existing_model_card = _add_table_to_model_card(
-             benchmark_results, existing_model_card
+             results_cache,
+             existing_model_card,
+             (model_name, *models_to_compare) if models_to_compare else (model_name,),
+             benchmarks or [],
          )

-     if push_to_hub:
+     if push_to_hub and existing_model_card_id_or_path:
+         existing_model_card_id_or_path = str(existing_model_card_id_or_path)
          if repo_exists(existing_model_card_id_or_path):
              existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
          else:
-             logger.warning(
-                 f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
-             )
+             msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
+             logger.warning(msg)
+             warnings.warn(msg)
      existing_model_card.save(output_path)


  def _add_table_to_model_card(
-     results: BenchmarkResults, model_card: ModelCard
+     results_cache: ResultCache,
+     model_card: ModelCard,
+     models: Sequence[str],
+     benchmarks: Sequence[Benchmark],
  ) -> ModelCard:
      original_content = model_card.content
-     results_df = results.to_dataframe()
-     results_df = results_df.set_index("task_name")
-     mteb_content = f"""
-     # MTEB results
-     {results_df.to_markdown()}
-     """
+     mteb_content = "# MTEB Results\n\n"
+
+     for benchmark in benchmarks:
+         mteb_content += f"## Benchmark: {benchmark.name}\n\n"
+         benchmark_results = results_cache.load_results(
+             tasks=benchmark,
+             models=models,
+             only_main_score=True,
+         )
+         df_results = benchmark_results.get_benchmark_result()
+         mteb_content += df_results.to_markdown(index=True) + "\n\n"
+
      model_card.content = original_content + "\n\n" + mteb_content
      return model_card
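With the reworked signature, tasks and benchmarks are passed separately and the comparison table is rebuilt per benchmark from the results cache. A hedged usage sketch of the updated `generate_model_card` call follows; the model, task, and benchmark names are placeholders and not recommendations:

    # Illustrative call of the updated signature; names are placeholders.
    from pathlib import Path

    import mteb
    from mteb.cache import ResultCache
    from mteb.cli.generate_model_card import generate_model_card

    generate_model_card(
        "intfloat/e5-small",
        tasks=list(mteb.get_tasks(["STS12"])),
        benchmarks=mteb.get_benchmarks(["MTEB(eng, v2)"]),
        results_cache=ResultCache(),
        output_path=Path("model_card.md"),
        add_table_to_model_card=True,
    )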