mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/_evaluators/text/summarization_evaluator.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 import sys
-from typing import Any, TypedDict
+from typing import TypedDict

 import numpy as np
 import torch
@@ -12,6 +12,7 @@ from mteb._evaluators.evaluator import Evaluator
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import cos_sim, dot_score
+from mteb.types import EncodeKwargs

 # if later than python 3.13 use typing module
 if sys.version_info >= (3, 13):
@@ -94,7 +95,7 @@ class SummarizationEvaluator(Evaluator):
         self,
         model: EncoderProtocol,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
     ) -> SummarizationDistances:
         # Get the human & machine summaries for the text in one go for all
         human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
@@ -135,10 +136,10 @@ class SummarizationEvaluator(Evaluator):
         )

         # Split the embeddings into the original human & machine summaries
-        embs_human_summaries_all = np.split(
+        embs_human_summaries_all_split = np.split(
             embs_human_summaries_all, np.cumsum(human_lens)[:-1]
         )
-        embs_machine_summaries_all = np.split(
+        embs_machine_summaries_all_split = np.split(
             embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
         )

@@ -148,7 +149,9 @@ class SummarizationEvaluator(Evaluator):
         all_human_scores = []

         for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
-            enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
+            enumerate(
+                zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
+            ),
             desc="Scoring",
             total=len(self.human_summaries),
         ):
@@ -164,7 +167,7 @@ class SummarizationEvaluator(Evaluator):
             dot_scores = dot_score(emb_machine_summary, embs_human_summaries)

             _sim_score = [
-                float(model.similarity(emb_machine_summary, emb_human_summary))  # type: ignore
+                float(model.similarity(emb_machine_summary, emb_human_summary))
                 for emb_human_summary in embs_human_summaries
             ]
             sim_score = torch.tensor(_sim_score)
@@ -216,17 +219,19 @@ class SummarizationEvaluator(Evaluator):
             strict=True,
         ):
             cosine_spearman_scores.append(
-                spearmanr(human_scores, cosine_pred_scores).statistic
+                float(spearmanr(human_scores, cosine_pred_scores).statistic)
             )
             cosine_pearson_scores.append(
-                pearsonr(human_scores, cosine_pred_scores).statistic
+                float(pearsonr(human_scores, cosine_pred_scores).statistic)
             )
             dot_spearman_scores.append(
-                spearmanr(human_scores, dot_pred_scores).statistic
+                float(spearmanr(human_scores, dot_pred_scores).statistic)
+            )
+            dot_pearson_scores.append(
+                float(pearsonr(human_scores, dot_pred_scores).statistic)
             )
-            dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
-            spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
-            pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
+            spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
+            pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))

         return SummarizationMetrics(
             pearson=float(np.mean(pearson_scores)),
@@ -273,10 +278,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
             pearson_scores.append(pearsonr(human_scores, sim_scores))

         return SummarizationMetrics(
-            pearson=float(np.mean(pearson_scores)),
-            spearman=float(np.mean(spearman_scores)),
-            cosine_spearman=float(np.mean(cosine_spearman_scores)),
-            cosine_pearson=float(np.mean(cosine_pearson_scores)),
-            dot_pearson=float(np.mean(dot_pearson_scores)),
-            dot_spearman=float(np.mean(dot_spearman_scores)),
+            pearson=float(np.mean(pearson_scores)),  # type: ignore[arg-type]
+            spearman=float(np.mean(spearman_scores)),  # type: ignore[arg-type]
+            cosine_spearman=float(np.mean(cosine_spearman_scores)),  # type: ignore[arg-type]
+            cosine_pearson=float(np.mean(cosine_pearson_scores)),  # type: ignore[arg-type]
+            dot_pearson=float(np.mean(dot_pearson_scores)),  # type: ignore[arg-type]
+            dot_spearman=float(np.mean(dot_spearman_scores)),  # type: ignore[arg-type]
         )
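
A note on the float(...) wrappers above: scipy's spearmanr and pearsonr return result objects whose .statistic attribute (scipy >= 1.9) is typed as a numpy scalar, so the wrappers keep the score lists as plain Python floats for type checkers and serialized results. A standalone sketch of the pattern (not mteb code):

    from scipy.stats import pearsonr, spearmanr

    human_scores = [1.0, 2.0, 3.0, 4.0]
    pred_scores = [1.1, 1.9, 3.2, 3.8]

    # .statistic is a numpy scalar; float() converts to a builtin float so
    # downstream np.mean aggregation and JSON output stay type-clean.
    spearman = float(spearmanr(human_scores, pred_scores).statistic)
    pearson = float(pearsonr(human_scores, pred_scores).statistic)
    print(spearman, pearson)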
mteb/_evaluators/zeroshot_classification_evaluator.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-from typing import Any

 from datasets import Dataset

@@ -10,7 +9,7 @@ from mteb._create_dataloaders import (
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models import EncoderProtocol
 from mteb.similarity_functions import similarity
-from mteb.types import Array
+from mteb.types import Array, EncodeKwargs

 from .evaluator import Evaluator

@@ -38,7 +37,10 @@ class ZeroShotClassificationEvaluator(Evaluator):
         self.hf_subset = hf_subset

     def __call__(
-        self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
+        self,
+        model: EncoderProtocol,
+        *,
+        encode_kwargs: EncodeKwargs,
     ) -> Array:
         dataloader = create_dataloader(
             self.dataset,
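
Across this release, `encode_kwargs: dict[str, Any]` becomes `encode_kwargs: EncodeKwargs`, imported from `mteb.types` (defined in `mteb.types._encoder_io`, per the abstask.py hunk below). A hypothetical TypedDict of the same flavor shows what the narrower type buys; the field names here are illustrative assumptions, not mteb's actual definition:

    from typing import TypedDict

    class EncodeKwargsSketch(TypedDict, total=False):
        # Hypothetical fields for illustration only; mteb's real EncodeKwargs
        # may declare different keys.
        batch_size: int
        show_progress_bar: bool

    def run_eval(*, encode_kwargs: EncodeKwargsSketch) -> None:
        batch_size = encode_kwargs.get("batch_size", 32)
        print(f"encoding with batch_size={batch_size}")

    run_eval(encode_kwargs={"batch_size": 16})
    # run_eval(encode_kwargs={"batch_szie": 16})  # a type checker flags this typo

Unlike dict[str, Any], a TypedDict lets static checkers validate the keys and value types at every call site.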
mteb/abstasks/_data_filter/filters.py CHANGED
@@ -61,7 +61,7 @@ def filter_unclear_label(
     for text, label in zip(ds[input_column], ds[label_column]):
         key = text.strip().lower()
         normalized.setdefault(key, set()).add(
-            label if isinstance(label, (str, int, float)) else tuple(label)
+            label if isinstance(label, (str, int, float)) else tuple(label)  # type: ignore[arg-type]
         )

     bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
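
The only change above is the `# type: ignore[arg-type]` annotation, but the surrounding pattern is worth spelling out: texts are normalized, and any text that maps to more than one distinct label is treated as unclear and dropped. A self-contained sketch of that idea, with hypothetical data rather than mteb's implementation:

    texts = ["Good movie", "good movie ", "Bad movie"]
    labels = [1, 0, 0]

    normalized: dict[str, set[int]] = {}
    for text, label in zip(texts, labels):
        key = text.strip().lower()
        normalized.setdefault(key, set()).add(label)

    # "good movie" received both 0 and 1, so it is an unclear example.
    bad_texts = {t for t, seen in normalized.items() if len(seen) > 1}
    kept = [(t, l) for t, l in zip(texts, labels) if t.strip().lower() not in bad_texts]
    print(kept)  # [('Bad movie', 0)]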
mteb/abstasks/_data_filter/task_pipelines.py CHANGED
@@ -89,6 +89,9 @@ def process_classification(
         subset=None,
     )

+    if task.dataset is None:
+        raise ValueError("Task dataset is None.")
+
     new_ds = {}
     for subset in task.dataset:
         new_ds[subset] = clean_dataset(
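
The added None guard is as much for the type checker as for runtime: after the raise, `task.dataset` is narrowed from an Optional to a concrete dict, so the following iteration type-checks. A minimal sketch of the same narrowing:

    def subsets(dataset: dict[str, list[int]] | None) -> list[str]:
        if dataset is None:
            raise ValueError("Task dataset is None.")
        # Here `dataset` is narrowed to dict[str, list[int]].
        return list(dataset)

    print(subsets({"default": [1, 2]}))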
mteb/abstasks/_statistics_calculation.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations

 import hashlib
 from collections import Counter
-from typing import TYPE_CHECKING
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, cast

 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -52,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()

     for img in images:
-        width, height = img.size  # type: ignore
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)

@@ -82,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.

     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-        label_len = [1] * len(labels)
-        total_label_len = len(labels)
-        total_labels = labels
+        # single label classification
+        single_label = cast(list[int], labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-        label_len = [len(l) for l in labels]
+        multilabel_labels = cast(list[list[int]], labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-        total_labels = []
-        for l in labels:
-            total_labels.extend(l if len(l) > 0 else [None])
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +167,7 @@ def calculate_top_ranked_statistics(


 def calculate_relevant_docs_statistics(
-    relevant_docs: dict[str, dict[str, float]],
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
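
calculate_label_statistics now uses typing.cast because `isinstance(labels[0], list)` narrows only the element, not the list type itself, so `list[int | list[int]]` must be asserted into the branch-specific type. A standalone sketch of the same technique:

    from typing import cast

    def label_lengths(labels: list[int | list[int]]) -> list[int]:
        if not isinstance(labels[0], list):
            # Checking labels[0] does not narrow the list type, so we
            # assert it with cast for the type checker.
            single = cast(list[int], labels)
            return [1] * len(single)
        multi = cast(list[list[int]], labels)
        return [len(item) for item in multi]

    print(label_lengths([1, 2, 3]), label_lengths([[1], [2, 3]]))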
mteb/abstasks/_stratification.py CHANGED
@@ -39,6 +39,7 @@ Bibtex:
 """

 import itertools
+from typing import Any

 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations < number_of_combinations  # type: ignore
-            and best_support_size > support_size  # type: ignore
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(  # type: ignore
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]

     def _prepare_stratification(
@@ -182,9 +184,9 @@
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[list[int]]],
-        dict[tuple[int, ...], list[int]],
-        list[list[int]],
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification

@@ -206,14 +208,14 @@
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]  # type: ignore
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]  # type: ignore
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]

         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@
                 all_combinations.append(combination)
                 per_row_combinations[sample_index].append(combination)

-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)  # type: ignore
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)  # type: ignore
+        ) = self._prepare_stratification(y)

         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations
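
The recurring move in this file is replacing `# type: ignore` comments with explicit `is not None` guards, so that comparisons against best-so-far values type-check honestly instead of being suppressed. A minimal sketch of that refactor under hypothetical data:

    currently_chosen: str | None = None
    best_count: int | None = None
    best_support: int | None = None

    for name, count, support in [("a", 3, 10), ("b", 5, 4), ("c", 4, 8)]:
        # The explicit None checks let the type checker narrow the Optionals
        # before the < and > comparisons, with no ignore comments needed.
        if currently_chosen is None or (
            best_count is not None
            and best_support is not None
            and best_count < count
            and best_support > support
        ):
            currently_chosen, best_count, best_support = name, count, support

    print(currently_chosen)  # "b"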
mteb/abstasks/abstask.py CHANGED
@@ -1,10 +1,11 @@
 import json
 import logging
+import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from copy import copy
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast

 import numpy as np
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
@@ -22,6 +23,7 @@ from mteb.models import (
     SearchProtocol,
 )
 from mteb.types import HFSubset, Modalities, ScoresDict
+from mteb.types._encoder_io import EncodeKwargs
 from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics

 logger = logging.getLogger(__name__)
@@ -78,8 +80,8 @@ class AbsTask(ABC):
     """

     metadata: TaskMetadata
-    abstask_prompt: str | None = None
-    _eval_splits: list[str] | None = None
+    abstask_prompt: str
+    _eval_splits: Sequence[str] | None = None
     dataset: dict[HFSubset, DatasetDict] | None = None
     data_loaded: bool = False
     hf_subsets: list[HFSubset]
@@ -102,9 +104,9 @@ class AbsTask(ABC):
     def check_if_dataset_is_superseded(self) -> None:
         """Check if the dataset is superseded by a newer version."""
         if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
+            msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
+            logger.warning(msg)
+            warnings.warn(msg)

     def dataset_transform(self):
         """A transform operations applied to the dataset after loading.
@@ -120,10 +122,10 @@ class AbsTask(ABC):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
-    ) -> dict[HFSubset, ScoresDict]:
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluates an MTEB compatible model on the task.

         Args:
@@ -195,12 +197,12 @@ class AbsTask(ABC):
     @abstractmethod
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
         hf_split: str,
         hf_subset: str,
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
@@ -210,7 +212,7 @@ class AbsTask(ABC):

     def _save_task_predictions(
         self,
-        predictions: dict[str, Any] | list[Any],
+        predictions: Mapping[str, Any] | list[Any],
         model: MTEBModels,
         prediction_folder: Path,
         hf_split: str,
@@ -226,7 +228,7 @@ class AbsTask(ABC):
             hf_subset: The subset of the dataset (e.g. "en").
         """
         predictions_path = self._predictions_path(prediction_folder)
-        existing_results = {
+        existing_results: dict[str, Any] = {
             "mteb_model_meta": {
                 "model_name": model.mteb_model_meta.name,
                 "revision": model.mteb_model_meta.revision,
@@ -326,7 +328,7 @@ class AbsTask(ABC):
             )
         else:
             # some of monolingual datasets explicitly adding the split name to the dataset name
-            self.dataset = load_dataset(**self.metadata.dataset)  # type: ignore
+            self.dataset = load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True

@@ -362,15 +364,19 @@ class AbsTask(ABC):
         """
         from mteb.abstasks import AbsTaskClassification

-        if self.metadata.descriptive_stat_path.exists() and not overwrite_results:
+        existing_stats = self.metadata.descriptive_stats
+
+        if existing_stats is not None and not overwrite_results:
             logger.info("Loading metadata descriptive statistics from cache.")
-            return self.metadata.descriptive_stats
+            return existing_stats

         if not self.data_loaded:
             self.load_data()

         descriptive_stats: dict[str, DescriptiveStatistics] = {}
-        hf_subset_stat = "hf_subset_descriptive_stats"
+        hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
+            "hf_subset_descriptive_stats"
+        )
         eval_splits = self.metadata.eval_splits
         if isinstance(self, AbsTaskClassification):
             eval_splits.append(self.train_split)
@@ -381,7 +387,7 @@ class AbsTask(ABC):
             logger.info(f"Processing metadata for split {split}")
             if self.metadata.is_multilingual:
                 descriptive_stats[split] = (
-                    self._calculate_descriptive_statistics_from_split(
+                    self._calculate_descriptive_statistics_from_split(  # type: ignore[assignment]
                         split, compute_overall=True
                     )
                 )
@@ -400,7 +406,7 @@ class AbsTask(ABC):
                 descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
         else:
             split_details = self._calculate_descriptive_statistics_from_split(split)
-            descriptive_stats[split] = split_details
+            descriptive_stats[split] = split_details  # type: ignore[assignment]

         with self.metadata.descriptive_stat_path.open("w") as f:
             json.dump(descriptive_stats, f, indent=4)
@@ -437,7 +443,7 @@ class AbsTask(ABC):

         return self.metadata.languages

-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
+    def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
         """Filter the evaluation splits of the task.

         Args:
@@ -451,9 +457,9 @@ class AbsTask(ABC):

     def filter_languages(
         self,
-        languages: list[str] | None,
-        script: list[str] | None = None,
-        hf_subsets: list[HFSubset] | None = None,
+        languages: Sequence[str] | None,
+        script: Sequence[str] | None = None,
+        hf_subsets: Sequence[HFSubset] | None = None,
         exclusive_language_filter: bool = False,
     ) -> Self:
         """Filter the languages of the task.
@@ -499,12 +505,14 @@ class AbsTask(ABC):
         self.hf_subsets = subsets_to_keep
         return self

-    def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
+    def _add_main_score(self, scores: ScoresDict) -> None:
         scores["main_score"] = scores[self.metadata.main_score]

     def _upload_dataset_to_hub(
         self, repo_name: str, fields: list[str] | dict[str, str]
     ) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset not loaded")
         if self.metadata.is_multilingual:
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -574,7 +582,7 @@ class AbsTask(ABC):
         return False

     @property
-    def eval_splits(self) -> list[str]:
+    def eval_splits(self) -> Sequence[str]:
         """Returns the evaluation splits of the task."""
         if self._eval_splits:
             return self._eval_splits
@@ -607,9 +615,8 @@ class AbsTask(ABC):
             self.data_loaded = False
             logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
         else:
-            logger.warning(
-                f"Dataset {self.metadata.name} is not loaded, cannot unload it."
-            )
+            msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
+            logger.warning(msg)

     @property
     def superseded_by(self) -> str | None:
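
check_if_dataset_is_superseded now emits a warnings.warn alongside logger.warning, so the message reaches users even when logging is not configured. A standalone sketch of that dual-channel pattern (names are illustrative, not mteb's API):

    import logging
    import warnings

    logger = logging.getLogger(__name__)

    def check_superseded(name: str, superseded_by: str | None) -> None:
        if superseded_by:
            msg = f"Dataset '{name}' is superseded by '{superseded_by}'."
            logger.warning(msg)   # reaches configured log handlers
            warnings.warn(msg)    # surfaces even without logging setup

    check_superseded("OldTask", "NewTask.v2")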
mteb/abstasks/aggregate_task_metadata.py CHANGED
@@ -5,7 +5,6 @@ from pydantic import ConfigDict, Field, model_validator
 from typing_extensions import Self

 from mteb.types import (
-    HFSubset,
     ISOLanguageScript,
     Languages,
     Licenses,
@@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata):
     reference: str | None = None
     bibtex_citation: str | None = None

-    @property
-    def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
-        """Return a dictionary mapping huggingface subsets to languages."""
-        if isinstance(self.eval_langs, dict):
-            return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
-
-    @model_validator(mode="after")  # type: ignore
+    @model_validator(mode="after")
     def _compute_unfilled_cases(self) -> Self:
         if not self.eval_langs:
             self.eval_langs = self._compute_eval_langs()
mteb/abstasks/aggregated_task.py CHANGED
@@ -1,14 +1,15 @@
 import logging
+import warnings
+from collections.abc import Mapping
 from pathlib import Path
 from typing import Any

 import numpy as np
 from datasets import Dataset, DatasetDict
-from typing_extensions import Self

 from mteb.models.models_protocols import MTEBModels
 from mteb.results.task_result import TaskResult
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import DescriptiveStatistics

 from .abstask import AbsTask
@@ -32,7 +33,7 @@ class AbsTaskAggregate(AbsTask):

     def task_results_to_scores(
         self, task_results: list[TaskResult]
-    ) -> dict[str, dict[HFSubset, ScoresDict]]:
+    ) -> dict[str, Mapping[HFSubset, ScoresDict]]:
         """The function that aggregated scores. Can be redefined to allow for custom aggregations.

         Args:
@@ -41,7 +42,7 @@ class AbsTaskAggregate(AbsTask):
         Returns:
             A dictionary with the aggregated scores.
         """
-        scores = {}
+        scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
         subsets = (
             self.metadata.eval_langs.keys()
             if isinstance(self.metadata.eval_langs, dict)
@@ -113,40 +114,20 @@ class AbsTaskAggregate(AbsTask):
         )
         mteb_versions = {tr.mteb_version for tr in task_results}
         if len(mteb_versions) != 1:
-            logger.warning(
-                f"All tasks of {self.metadata.name} is not run using the same version."
-            )
+            msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
+            logger.warning(msg)
+            warnings.warn(msg)
             task_res.mteb_version = None
         task_res.mteb_version = task_results[0].mteb_version
         return task_res

-    def check_if_dataset_is_superseded(self) -> None:
-        """Check if the dataset is superseded by a newer version"""
-        if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
-
-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
-        """Filter the evaluation splits of the task.
-
-        Args:
-            eval_splits: List of splits to evaluate on. If None, all splits in metadata
-            are used.
-
-        Returns:
-            The task with filtered evaluation splits.
-        """
-        self._eval_splits = eval_splits
-        return self
-
     def evaluate(
         self,
         model: MTEBModels,
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
@@ -160,7 +141,7 @@ class AbsTaskAggregate(AbsTask):
         self,
         model: MTEBModels,
         data_split: DatasetDict | Dataset,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         **kwargs: Any,
     ) -> ScoresDict:
         raise NotImplementedError(
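
Many signatures in this release also widen dict[...] parameters and return types to collections.abc.Mapping[...] (for example _save_task_predictions, calculate_relevant_docs_statistics, and the return type of evaluate). Mapping is read-only and covariant in its value type, so callers can pass more specific dict types without variance errors. A small sketch:

    from collections.abc import Mapping

    def count_qrels(relevant_docs: Mapping[str, Mapping[str, int]]) -> int:
        # The function only reads, so the weaker Mapping contract suffices.
        return sum(len(docs) for docs in relevant_docs.values())

    # A plain dict[str, dict[str, int]] is accepted where Mapping is expected;
    # annotating the parameter as dict would reject other mapping types.
    print(count_qrels({"q1": {"d1": 1, "d2": 0}, "q2": {"d3": 1}}))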