mteb 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (155)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +16 -9
  3. mteb/_evaluators/any_sts_evaluator.py +10 -5
  4. mteb/_evaluators/clustering_evaluator.py +10 -4
  5. mteb/_evaluators/evaluator.py +9 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  8. mteb/_evaluators/retrieval_evaluator.py +19 -13
  9. mteb/_evaluators/retrieval_metrics.py +9 -3
  10. mteb/_evaluators/sklearn_evaluator.py +14 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  12. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +8 -2
  16. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  17. mteb/abstasks/_statistics_calculation.py +6 -4
  18. mteb/abstasks/abstask.py +17 -9
  19. mteb/abstasks/aggregate_task_metadata.py +20 -9
  20. mteb/abstasks/aggregated_task.py +15 -8
  21. mteb/abstasks/classification.py +15 -6
  22. mteb/abstasks/clustering.py +17 -8
  23. mteb/abstasks/clustering_legacy.py +14 -6
  24. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  25. mteb/abstasks/multilabel_classification.py +11 -5
  26. mteb/abstasks/pair_classification.py +19 -9
  27. mteb/abstasks/regression.py +14 -6
  28. mteb/abstasks/retrieval.py +27 -16
  29. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  30. mteb/abstasks/sts.py +19 -10
  31. mteb/abstasks/task_metadata.py +17 -8
  32. mteb/abstasks/text/bitext_mining.py +14 -7
  33. mteb/abstasks/text/summarization.py +17 -7
  34. mteb/abstasks/zeroshot_classification.py +15 -7
  35. mteb/benchmarks/_create_table.py +13 -3
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  38. mteb/cache.py +20 -14
  39. mteb/cli/_display_tasks.py +9 -3
  40. mteb/cli/build_cli.py +5 -2
  41. mteb/cli/generate_model_card.py +9 -2
  42. mteb/deprecated_evaluator.py +16 -12
  43. mteb/evaluate.py +20 -18
  44. mteb/filter_tasks.py +12 -7
  45. mteb/get_tasks.py +9 -4
  46. mteb/languages/language_scripts.py +8 -3
  47. mteb/leaderboard/app.py +7 -3
  48. mteb/leaderboard/table.py +7 -2
  49. mteb/load_results.py +9 -3
  50. mteb/models/abs_encoder.py +22 -12
  51. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  52. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  53. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  54. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  55. mteb/models/get_model_meta.py +11 -4
  56. mteb/models/instruct_wrapper.py +13 -5
  57. mteb/models/model_implementations/align_models.py +9 -4
  58. mteb/models/model_implementations/bedrock_models.py +16 -6
  59. mteb/models/model_implementations/blip2_models.py +9 -4
  60. mteb/models/model_implementations/blip_models.py +9 -4
  61. mteb/models/model_implementations/bm25.py +15 -10
  62. mteb/models/model_implementations/bmretriever_models.py +6 -2
  63. mteb/models/model_implementations/cde_models.py +9 -5
  64. mteb/models/model_implementations/clip_models.py +9 -4
  65. mteb/models/model_implementations/cohere_models.py +10 -4
  66. mteb/models/model_implementations/cohere_v.py +9 -4
  67. mteb/models/model_implementations/colpali_models.py +4 -3
  68. mteb/models/model_implementations/colqwen_models.py +10 -31
  69. mteb/models/model_implementations/colsmol_models.py +1 -1
  70. mteb/models/model_implementations/conan_models.py +10 -4
  71. mteb/models/model_implementations/dino_models.py +9 -4
  72. mteb/models/model_implementations/e5_v.py +9 -4
  73. mteb/models/model_implementations/eagerworks_models.py +10 -4
  74. mteb/models/model_implementations/evaclip_models.py +9 -4
  75. mteb/models/model_implementations/gme_v_models.py +5 -3
  76. mteb/models/model_implementations/google_models.py +10 -4
  77. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
  78. mteb/models/model_implementations/hinvec_models.py +5 -1
  79. mteb/models/model_implementations/jasper_models.py +12 -5
  80. mteb/models/model_implementations/jina_clip.py +9 -4
  81. mteb/models/model_implementations/jina_models.py +10 -5
  82. mteb/models/model_implementations/kalm_models.py +18 -12
  83. mteb/models/model_implementations/linq_models.py +6 -1
  84. mteb/models/model_implementations/listconranker.py +9 -4
  85. mteb/models/model_implementations/llm2clip_models.py +9 -4
  86. mteb/models/model_implementations/llm2vec_models.py +12 -6
  87. mteb/models/model_implementations/mcinext_models.py +5 -2
  88. mteb/models/model_implementations/mdbr_models.py +3 -1
  89. mteb/models/model_implementations/{mxbai_models.py → mixedbread_ai_models.py} +91 -0
  90. mteb/models/model_implementations/moco_models.py +9 -4
  91. mteb/models/model_implementations/mod_models.py +1 -1
  92. mteb/models/model_implementations/model2vec_models.py +10 -4
  93. mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
  94. mteb/models/model_implementations/nomic_models.py +10 -4
  95. mteb/models/model_implementations/nomic_models_vision.py +4 -3
  96. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
  97. mteb/models/model_implementations/nvidia_models.py +12 -4
  98. mteb/models/model_implementations/octen_models.py +1 -1
  99. mteb/models/model_implementations/openai_models.py +9 -4
  100. mteb/models/model_implementations/openclip_models.py +9 -4
  101. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
  102. mteb/models/model_implementations/ops_moa_models.py +7 -2
  103. mteb/models/model_implementations/pixie_models.py +56 -0
  104. mteb/models/model_implementations/promptriever_models.py +12 -6
  105. mteb/models/model_implementations/pylate_models.py +19 -13
  106. mteb/models/model_implementations/qwen3_models.py +8 -1
  107. mteb/models/model_implementations/random_baseline.py +4 -3
  108. mteb/models/model_implementations/repllama_models.py +13 -6
  109. mteb/models/model_implementations/rerankers_custom.py +10 -4
  110. mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
  111. mteb/models/model_implementations/salesforce_models.py +7 -1
  112. mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
  113. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
  114. mteb/models/model_implementations/seed_models.py +1 -1
  115. mteb/models/model_implementations/siglip_models.py +9 -4
  116. mteb/models/model_implementations/slm_models.py +7 -4
  117. mteb/models/model_implementations/uae_models.py +9 -4
  118. mteb/models/model_implementations/vdr_models.py +7 -1
  119. mteb/models/model_implementations/vista_models.py +9 -4
  120. mteb/models/model_implementations/vlm2vec_models.py +9 -4
  121. mteb/models/model_implementations/voyage_models.py +10 -4
  122. mteb/models/model_implementations/voyage_v.py +10 -6
  123. mteb/models/model_implementations/yuan_models_en.py +1 -1
  124. mteb/models/model_meta.py +12 -7
  125. mteb/models/models_protocols.py +19 -18
  126. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  127. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  128. mteb/models/search_wrappers.py +19 -12
  129. mteb/models/sentence_transformer_wrapper.py +4 -3
  130. mteb/models/vllm_wrapper.py +8 -6
  131. mteb/results/benchmark_results.py +22 -17
  132. mteb/results/model_result.py +21 -15
  133. mteb/results/task_result.py +41 -10
  134. mteb/similarity_functions.py +8 -2
  135. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  136. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  137. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  138. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  139. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  140. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  141. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  142. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  143. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  144. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  145. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  146. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  147. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  148. mteb/types/_encoder_io.py +1 -1
  149. mteb/types/statistics.py +9 -2
  150. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
  151. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/RECORD +155 -154
  152. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
  153. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
  154. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
  155. {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmarks/rteb_benchmarks.py CHANGED
@@ -10,6 +10,8 @@ RTEB_CITATION = r"""@article{rteb2025,
  year = {2025},
  }"""

+ removal_note = "\n\nNote: We have temporarily removed the 'Private' column to read more about this decision out the [announcement](https://github.com/embeddings-benchmark/mteb/issues/3934)."
+
  RTEB_MAIN = RtebBenchmark(
  name="RTEB(beta)",
  display_name="RTEB Multilingual",
@@ -48,7 +50,8 @@ RTEB_MAIN = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -83,7 +86,8 @@ RTEB_ENGLISH = RtebBenchmark(
  ],
  languages=["eng"],
  ),
- description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -101,7 +105,8 @@ RTEB_FRENCH = RtebBenchmark(
  ],
  languages=["fra"],
  ),
- description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -119,7 +124,8 @@ RTEB_GERMAN = RtebBenchmark(
  "GermanLegal1Retrieval",
  ],
  ),
- description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -135,7 +141,8 @@ RTEB_JAPANESE = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -156,7 +163,8 @@ RTEB_FINANCE = RtebBenchmark(
  "EnglishFinance4Retrieval",
  ],
  ),
- description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -177,7 +185,8 @@ RTEB_LEGAL = RtebBenchmark(
  "JapaneseLegal1Retrieval",
  ],
  ),
- description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -199,7 +208,8 @@ RTEB_CODE = RtebBenchmark(
  "JapaneseCode1Retrieval",
  ],
  ),
- description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
@@ -217,7 +227,8 @@ RTEB_HEALTHCARE = RtebBenchmark(
  "GermanHealthcare1Retrieval",
  ],
  ),
- description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues).",
+ description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
+ + removal_note,
  citation=RTEB_CITATION,
  contacts=["fzowl"],
  )
mteb/cache.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import gzip
  import io
  import json
@@ -7,18 +9,22 @@ import shutil
  import subprocess
  import warnings
  from collections import defaultdict
- from collections.abc import Iterable, Sequence
  from pathlib import Path
- from typing import cast
+ from typing import TYPE_CHECKING, cast

  import requests
+ from pydantic import ValidationError

  import mteb
  from mteb.abstasks import AbsTask
  from mteb.benchmarks.benchmark import Benchmark
  from mteb.models import ModelMeta
  from mteb.results import BenchmarkResults, ModelResult, TaskResult
- from mteb.types import ModelName, Revision
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.types import ModelName, Revision

  logger = logging.getLogger(__name__)

@@ -27,8 +33,8 @@ class ResultCache:
  """Class to handle the local cache of MTEB results.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache(cache_path="~/.cache/mteb") # default
+ >>> import mteb
+ >>> cache = mteb.ResultCache(cache_path="~/.cache/mteb") # default
  >>> cache.download_from_remote() # download the latest results from the remote repository
  >>> result = cache.load_results("task_name", "model_name")
  """
@@ -320,8 +326,8 @@ class ResultCache:
  OSError: On other file system errors

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>> # Download optimized cached results
  >>> cache_file = cache._download_cached_results_from_branch()
  >>> # Use custom output path
@@ -460,8 +466,8 @@ class ResultCache:
  A list of paths in the cache directory.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>>
  >>> # Get all cache paths
  >>> paths = cache.get_cache_paths()
@@ -583,7 +589,7 @@ class ResultCache:

  first_model = next(iter(models))
  if isinstance(first_model, ModelMeta):
- models = cast(Iterable[ModelMeta], models)
+ models = cast("Iterable[ModelMeta]", models)
  name_and_revision = {
  (m.model_name_as_path(), m.revision or "no_revision_available")
  for m in models
@@ -594,7 +600,7 @@ class ResultCache:
  if (p.parent.parent.name, p.parent.name) in name_and_revision
  ]

- str_models = cast(Sequence[str], models)
+ str_models = cast("Sequence[str]", models)
  model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
  return [p for p in paths if p.parent.parent.name in model_names]

@@ -642,8 +648,8 @@ class ResultCache:
  A BenchmarkResults object containing the results for the specified models and tasks.

  Examples:
- >>> from mteb.cache import ResultCache
- >>> cache = ResultCache()
+ >>> import mteb
+ >>> cache = mteb.ResultCache()
  >>>
  >>> # Load results for specific models and tasks
  >>> results = cache.load_results(
@@ -686,7 +692,7 @@ class ResultCache:
  task_result = task_result.validate_and_filter_scores(
  task=task_instance
  )
- except Exception as e:
+ except ValidationError as e:
  logger.info(
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
  )
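The cache.py hunks above are representative of most changes in this release: typing-only imports move behind `TYPE_CHECKING` (with `from __future__ import annotations` added where needed), and `cast()` targets are quoted so the deferred names never have to exist at runtime. A minimal sketch of the pattern, using illustrative module and type names rather than mteb's own:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    # Only evaluated by static type checkers, never at runtime, so these
    # imports add no import-time cost and cannot create import cycles.
    from collections.abc import Iterable

    from mypackage.models import ModelMeta  # hypothetical module, for illustration


def first_model_name(models: object) -> str:
    # The cast target is a string, so "Iterable[ModelMeta]" does not need to
    # be importable when this line executes.
    typed = cast("Iterable[ModelMeta]", models)
    return next(iter(typed)).name
```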
mteb/cli/_display_tasks.py CHANGED
@@ -1,9 +1,15 @@
- from collections.abc import Iterable, Sequence
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING

- from mteb.abstasks import AbsTask
- from mteb.benchmarks import Benchmark
  from mteb.get_tasks import MTEBTasks

+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks import AbsTask
+ from mteb.benchmarks import Benchmark
+

  def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
  """Get all benchmarks available in the MTEB."""
mteb/cli/build_cli.py CHANGED
@@ -3,17 +3,20 @@ import logging
  import os
  import warnings
  from pathlib import Path
+ from typing import TYPE_CHECKING

  import torch
  from rich.logging import RichHandler

  import mteb
- from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
  from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy
- from mteb.types._encoder_io import EncodeKwargs
+
+ if TYPE_CHECKING:
+ from mteb.abstasks.abstask import AbsTask
+ from mteb.types import EncodeKwargs

  logger = logging.getLogger(__name__)

mteb/cli/generate_model_card.py CHANGED
@@ -1,14 +1,21 @@
+ from __future__ import annotations
+
  import logging
  import warnings
- from collections.abc import Sequence
  from pathlib import Path
+ from typing import TYPE_CHECKING

  from huggingface_hub import ModelCard, ModelCardData, repo_exists

  from mteb.abstasks.abstask import AbsTask
- from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache

+ if TYPE_CHECKING:
+ from collections.abc import Sequence
+
+ from mteb.abstasks.abstask import AbsTask
+ from mteb.benchmarks.benchmark import Benchmark
+
  logger = logging.getLogger(__name__)


mteb/deprecated_evaluator.py CHANGED
@@ -6,7 +6,6 @@ import os
  import sys
  import traceback
  import warnings
- from collections.abc import Iterable, Sequence
  from copy import deepcopy
  from datetime import datetime
  from itertools import chain
@@ -18,26 +17,31 @@ import datasets

  import mteb
  from mteb.abstasks import AbsTask
- from mteb.abstasks.aggregated_task import AbsTaskAggregate
- from mteb.abstasks.task_metadata import TaskCategory, TaskType
  from mteb.benchmarks import Benchmark
  from mteb.models import (
  CrossEncoderWrapper,
  ModelMeta,
- MTEBModels,
  SentenceTransformerEncoderWrapper,
  )
  from mteb.results import TaskResult
- from mteb.types import EncodeKwargs, ScoresDict
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
+ from mteb.abstasks.task_metadata import TaskCategory, TaskType
+ from mteb.models import (
+ MTEBModels,
+ )
+ from mteb.types import EncodeKwargs, ScoresDict

  if sys.version_info >= (3, 13):
  from warnings import deprecated
  else:
  from typing_extensions import deprecated

- if TYPE_CHECKING:
- from sentence_transformers import CrossEncoder, SentenceTransformer
-
  logger = logging.getLogger(__name__)


@@ -66,9 +70,9 @@ class MTEB:
  """
  if isinstance(next(iter(tasks)), Benchmark):
  self.benchmarks = tasks
- self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+ self.tasks = list(chain.from_iterable(cast("Iterable[Benchmark]", tasks)))
  elif isinstance(next(iter(tasks)), AbsTask):
- self.tasks = list(cast(Iterable[AbsTask], tasks))
+ self.tasks = list(cast("Iterable[AbsTask]", tasks))

  self.err_logs_path = Path(err_logs_path)
  self._last_evaluated_splits: dict[str, list[str]] = {}
@@ -313,7 +317,7 @@ class MTEB:
  elif isinstance(model, CrossEncoder):
  mteb_model = CrossEncoderWrapper(model)
  else:
- mteb_model = cast(MTEBModels, model)
+ mteb_model = cast("MTEBModels", model)

  meta = self.create_model_meta(mteb_model)
  output_path = self._create_output_folder(meta, output_folder)
@@ -346,7 +350,7 @@ class MTEB:
  )

  if task.is_aggregate:
- aggregated_task = cast(AbsTaskAggregate, task)
+ aggregated_task = cast("AbsTaskAggregate", task)
  self_ = MTEB(tasks=aggregated_task.metadata.tasks)
  aggregated_task_results = self_.run(
  mteb_model,
mteb/evaluate.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations

  import logging
  import warnings
- from collections.abc import Iterable
  from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, cast
@@ -17,22 +16,25 @@ from mteb.abstasks.aggregated_task import AbsTaskAggregate
  from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache
  from mteb.models.model_meta import ModelMeta
- from mteb.models.models_protocols import (
- MTEBModels,
- )
  from mteb.models.sentence_transformer_wrapper import (
  CrossEncoderWrapper,
  SentenceTransformerEncoderWrapper,
  )
  from mteb.results import ModelResult, TaskResult
  from mteb.results.task_result import TaskError
- from mteb.types import HFSubset, PromptType, SplitName
- from mteb.types._encoder_io import EncodeKwargs
- from mteb.types._metadata import ModelName, Revision
+ from mteb.types import PromptType

  if TYPE_CHECKING:
+ from collections.abc import Iterable
+
  from sentence_transformers import CrossEncoder, SentenceTransformer

+ from mteb.models.models_protocols import (
+ MTEBModels,
+ )
+ from mteb.types import EncodeKwargs, HFSubset, SplitName
+ from mteb.types._metadata import ModelName, Revision
+
  logger = logging.getLogger(__name__)


@@ -69,13 +71,13 @@ def _sanitize_model(
  """
  meta = getattr(model, "mteb_model_meta")
  if not isinstance(meta, ModelMeta):
  meta = ModelMeta._from_hub(None)
- wrapped_model = cast(MTEBModels | ModelMeta, model)
+ wrapped_model = cast("MTEBModels | ModelMeta", model)
  else:
  meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
  wrapped_model = meta

- model_name = cast(str, meta.name)
- model_revision = cast(str, meta.revision)
+ model_name = cast("str", meta.name)
+ model_revision = cast("str", meta.revision)

  return wrapped_model, meta, model_name, model_revision

@@ -132,8 +134,8 @@

  task.check_if_dataset_is_superseded()

- data_loaded = task.data_loaded
- if not data_loaded:
+ data_preloaded = task.data_loaded
+ if not data_preloaded:
  try:
  task.load_data()
  except DatasetNotFoundError as e:
@@ -176,7 +178,7 @@
  kg_co2_emissions=None,
  )

- if data_loaded: # only unload if we loaded the data
+ if not data_preloaded: # only unload if we loaded the data
  task.unload_data()

  return result
@@ -202,10 +204,10 @@
  if isinstance(tasks, AbsTask):
  check_tasks = [tasks]
  elif isinstance(tasks, Benchmark):
- benchmark = cast(Benchmark, tasks)
+ benchmark = cast("Benchmark", tasks)
  check_tasks = benchmark.tasks
  else:
- check_tasks = cast(Iterable[AbsTask], tasks)
+ check_tasks = cast("Iterable[AbsTask]", tasks)

  warnings, errors = [], []

@@ -298,7 +300,7 @@
  changed.
  - "only-cache": Only load the results from the cache folder and do not run the task. Useful if you just want to load the results from the
  cache.
- prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
+ prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be saved in `prediction_folder/{task_name}_predictions.json`
  show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
  `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
  public_only: Run only public tasks. If None, it will attempt to run the private task.
@@ -342,7 +344,7 @@

  # AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
  if isinstance(tasks, AbsTaskAggregate):
- aggregated_task = cast(AbsTaskAggregate, tasks)
+ aggregated_task = cast("AbsTaskAggregate", tasks)
  results = evaluate(
  model,
  aggregated_task.metadata.tasks,
@@ -365,7 +367,7 @@
  if isinstance(tasks, AbsTask):
  task = tasks
  else:
- tasks = cast(Iterable[AbsTask], tasks)
+ tasks = cast("Iterable[AbsTask]", tasks)
  evaluate_results = []
  exceptions = []
  tasks_tqdm = tqdm(
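Besides the import shuffle, the evaluate.py diff carries a small behavioural fix: 2.7.1 unloaded the task's dataset when it had already been loaded by the caller, whereas 2.7.3 unloads it only when `_evaluate_task` loaded it itself. A simplified sketch of the corrected guard; the `Task` class here is an illustrative stand-in, not mteb's actual `AbsTask`:

```python
class Task:
    """Illustrative stand-in for an mteb task with lazy dataset loading."""

    def __init__(self) -> None:
        self.data_loaded = False

    def load_data(self) -> None:
        self.data_loaded = True

    def unload_data(self) -> None:
        self.data_loaded = False


def evaluate_task(task: Task) -> None:
    data_preloaded = task.data_loaded  # was the data loaded before we got here?
    if not data_preloaded:
        task.load_data()

    ...  # run the actual evaluation

    if not data_preloaded:  # only unload if we loaded the data ourselves
        task.unload_data()
```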
mteb/filter_tasks.py CHANGED
@@ -1,19 +1,24 @@
  """This script contains functions that are used to get an overview of the MTEB benchmark."""

+ from __future__ import annotations
+
  import logging
- from collections.abc import Iterable, Sequence
- from typing import overload
+ from typing import TYPE_CHECKING, overload

- from mteb.abstasks import (
- AbsTask,
- )
  from mteb.abstasks.aggregated_task import AbsTaskAggregate
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
  from mteb.languages import (
  ISO_TO_LANGUAGE,
  ISO_TO_SCRIPT,
  )
- from mteb.types import Modalities
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks import (
+ AbsTask,
+ )
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+ from mteb.types import Modalities

  logger = logging.getLogger(__name__)

mteb/get_tasks.py CHANGED
@@ -1,20 +1,25 @@
  """This script contains functions that are used to get an overview of the MTEB benchmark."""

+ from __future__ import annotations
+
  import difflib
  import logging
  import warnings
  from collections import Counter, defaultdict
- from collections.abc import Iterable, Sequence
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import pandas as pd

  from mteb.abstasks import (
  AbsTask,
  )
- from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
  from mteb.filter_tasks import filter_tasks
- from mteb.types import Modalities
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
+ from mteb.types import Modalities

  logger = logging.getLogger(__name__)

mteb/languages/language_scripts.py CHANGED
@@ -1,10 +1,15 @@
- from collections.abc import Iterable, Sequence
- from dataclasses import dataclass
+ from __future__ import annotations

- from typing_extensions import Self
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING

  from mteb.languages.check_language_code import check_language_code

+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from typing_extensions import Self
+

  @dataclass
  class LanguageScripts:
mteb/leaderboard/app.py CHANGED
@@ -1,3 +1,5 @@
+ from __future__ import annotations
+
  import itertools
  import json
  import logging
@@ -5,15 +7,14 @@ import tempfile
  import time
  import warnings
  from pathlib import Path
- from typing import Literal, get_args
+ from typing import TYPE_CHECKING, Literal, get_args
  from urllib.parse import urlencode

  import cachetools
  import gradio as gr
- import pandas as pd
+ import pandas as pd # noqa: TC002 # gradio tries to validate typehints

  import mteb
- from mteb import BenchmarkResults
  from mteb.benchmarks.benchmark import RtebBenchmark
  from mteb.cache import ResultCache
  from mteb.leaderboard.benchmark_selector import (
@@ -31,6 +32,9 @@ from mteb.leaderboard.table import (
  from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
  from mteb.models.model_meta import MODEL_TYPES

+ if TYPE_CHECKING:
+ from mteb import BenchmarkResults
+
  logger = logging.getLogger(__name__)


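The `# noqa: TC002` on the pandas import in app.py is deliberate: per the comment in the diff, Gradio tries to validate type hints at runtime, so `pd` must still be importable when the (now string-valued) annotations are evaluated; moving it into the `TYPE_CHECKING` block would raise a `NameError`. A hedged illustration with a toy handler rather than the leaderboard's real callbacks:

```python
from __future__ import annotations

from typing import get_type_hints

import pandas as pd  # noqa: TC002  # must stay a runtime import: hints are resolved later


def update_table(scores: pd.DataFrame) -> pd.DataFrame:
    """Toy callback; Gradio-style frameworks introspect its annotations."""
    return scores.sort_values(scores.columns[0], ascending=False)


# Resolving the string annotations (roughly what such frameworks do) would
# fail with a NameError if `pd` were only imported under TYPE_CHECKING.
print(get_type_hints(update_table))
```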
mteb/leaderboard/table.py CHANGED
@@ -1,3 +1,7 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
  import gradio as gr
  import matplotlib.pyplot as plt
  import numpy as np
@@ -5,8 +9,9 @@ import pandas as pd
  from matplotlib.colors import LinearSegmentedColormap
  from pandas.api.types import is_numeric_dtype

- from mteb.benchmarks.benchmark import Benchmark
- from mteb.results.benchmark_results import BenchmarkResults
+ if TYPE_CHECKING:
+ from mteb.benchmarks.benchmark import Benchmark
+ from mteb.results.benchmark_results import BenchmarkResults


  def _borda_count(scores: pd.Series) -> pd.Series:
mteb/load_results.py CHANGED
@@ -1,13 +1,19 @@
+ from __future__ import annotations
+
  import json
  import logging
  import sys
- from collections.abc import Iterable, Sequence
- from pathlib import Path
+ from typing import TYPE_CHECKING

  from mteb.abstasks.abstask import AbsTask
  from mteb.models.model_meta import ModelMeta
  from mteb.results import BenchmarkResults, ModelResult, TaskResult
- from mteb.types import ModelName, Revision
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+ from pathlib import Path
+
+ from mteb.types import ModelName, Revision

  if sys.version_info >= (3, 13):
  from warnings import deprecated