mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/abstasks/retrieval_dataset_loaders.py CHANGED
@@ -136,7 +136,7 @@ class RetrievalDatasetLoader:
             "_id", "id"
         )
         logger.info("Loaded %d %s Documents.", len(corpus_ds), self.split.upper())
-        logger.info("Doc Example: %s", corpus_ds[0])
+        logger.debug("Doc Example: %s", corpus_ds[0])
         return corpus_ds

     def _load_queries(self) -> QueryDatasetType:
@@ -152,7 +152,7 @@ class RetrievalDatasetLoader:
         )

         logger.info("Loaded %d %s queries.", len(queries_ds), self.split.upper())
-        logger.info("Query Example: %s", queries_ds[0])
+        logger.debug("Query Example: %s", queries_ds[0])

         return queries_ds

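The only change here demotes the per-record example logs from INFO to DEBUG, so they no longer appear with default logging settings. A minimal sketch, using only the standard library, of how a user could opt back into them:

```python
import logging

# Root logger at INFO, but let the mteb logger pass DEBUG records through
# (the root handler has no level of its own, so they will be emitted).
logging.basicConfig(level=logging.INFO)
logging.getLogger("mteb").setLevel(logging.DEBUG)
```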
mteb/abstasks/sts.py CHANGED
@@ -7,8 +7,8 @@ from scipy.stats import pearsonr, spearmanr

 from mteb._evaluators import AnySTSEvaluator
 from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
-from mteb.models import EncoderProtocol
-from mteb.types import PromptType
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     ScoreStatistics,
@@ -103,14 +103,17 @@ class AbsTaskSTS(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> STSMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = list(map(self._normalize, data_split["score"]))
         data_split = data_split.select_columns(list(self.column_names))

@@ -142,7 +145,7 @@
     ) -> STSMetrics:
         def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
             """Return (pearson, spearman) correlations between x and y."""
-            return pearsonr(x, y)[0], spearmanr(x, y)[0]
+            return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])

         cosine_pearson, cosine_spearman = compute_corr(
             normalized_scores, scores["cosine_scores"]
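The `float(...)` wrapping matters because `pearsonr`/`spearmanr` return numpy scalars inside their result objects, which can trip strict typing and JSON serialization. A standalone illustration (not mteb code):

```python
from scipy.stats import pearsonr, spearmanr

x = [0.1, 0.4, 0.35, 0.8]
y = [0.2, 0.3, 0.5, 0.7]

# Indexing [0] yields the statistic as a numpy float; float() converts it to
# a plain Python float so downstream score dicts serialize cleanly.
pearson = float(pearsonr(x, y)[0])
spearman = float(spearmanr(x, y)[0])
print(pearson, spearman)
```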
mteb/abstasks/task_metadata.py CHANGED
@@ -2,9 +2,10 @@ import json
 import logging
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 from huggingface_hub import (
+    CardData,
     DatasetCard,
     DatasetCardData,
     constants,
@@ -150,7 +151,7 @@ _TASK_TYPE = (
     "InstructionReranking",
 ) + MIEB_TASK_TYPE

-TaskType = Literal[_TASK_TYPE]
+TaskType = Literal[_TASK_TYPE]  # type: ignore[valid-type]
 """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""


@@ -192,8 +193,10 @@ AnnotatorType = Literal[
 """The type of the annotators. Is often important for understanding the quality of a dataset."""


-PromptDict = TypedDict(
-    "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, total=False
+PromptDict = TypedDict(  # type: ignore[misc]
+    "PromptDict",
+    {prompt_type.value: str for prompt_type in PromptType},
+    total=False,
 )
 """A dictionary containing the prompt used for the task.

@@ -365,7 +368,7 @@ class TaskMetadata(BaseModel):
         """Return a dictionary mapping huggingface subsets to languages."""
         if isinstance(self.eval_langs, dict):
             return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
+        return {"default": cast(list[str], self.eval_langs)}

     @property
     def intext_citation(self, include_cite: bool = True) -> str:
@@ -376,9 +379,8 @@ class TaskMetadata(BaseModel):
         if include_cite and cite:
             # check for whitespace in the citation
             if " " in cite:
-                logger.warning(
-                    "Citation contains whitespace. Please ensure that the citation is correctly formatted."
-                )
+                msg = "Citation contains whitespace. Please ensure that the citation is correctly formatted."
+                logger.warning(msg)
             return f"\\cite{{{cite}}}"
         return cite

@@ -414,7 +416,7 @@ class TaskMetadata(BaseModel):
         for subset, subset_value in stats.items():
             if subset == "hf_subset_descriptive_stats":
                 continue
-            n_samples[subset] = subset_value["num_samples"]  # type: ignore
+            n_samples[subset] = subset_value["num_samples"]
         return n_samples

     @property
@@ -447,7 +449,7 @@ class TaskMetadata(BaseModel):
         Raises:
             ValueError: If the prompt type is not recognized.
         """
-        if prompt_type is None:
+        if prompt_type is None or self.category is None:
             return self.modalities
         query_modalities, doc_modalities = self.category.split("2")
         category_to_modality: dict[str, Modalities] = {
@@ -467,7 +469,7 @@

     def _create_dataset_card_data(
         self,
-        existing_dataset_card_data: DatasetCardData | None = None,
+        existing_dataset_card_data: CardData | None = None,
     ) -> tuple[DatasetCardData, dict[str, Any]]:
         """Create a DatasetCardData object from the task metadata.

@@ -483,7 +485,6 @@
         dataset_type = [
             *self._hf_task_type(),
             *self._hf_task_category(),
-            *self._hf_subtypes(),
         ]
         languages = self._hf_languages()

@@ -502,12 +503,13 @@

         tags = ["mteb"] + self.modalities

-        descriptive_stats = self.descriptive_stats
-        if descriptive_stats is not None:
-            for split, split_stat in descriptive_stats.items():
+        descriptive_stats = ""
+        if self.descriptive_stats is not None:
+            descriptive_stats_ = self.descriptive_stats
+            for split, split_stat in descriptive_stats_.items():
                 if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
                     split_stat.pop("hf_subset_descriptive_stats", {})
-            descriptive_stats = json.dumps(descriptive_stats, indent=4)
+            descriptive_stats = json.dumps(descriptive_stats_, indent=4)
         dataset_card_data_params = existing_dataset_card_data.to_dict()
         # override the existing values
@@ -584,10 +586,8 @@ class TaskMetadata(BaseModel):

     def _hf_subtypes(self) -> list[str]:
         # to get full list of available task_ids execute
-        # requests.post("https://huggingface.co/api/validate-yaml", json={
-        #     "content": "---\ntask_ids: 'test'\n---",
-        #     "repoType": "dataset"
-        # })
+        # https://huggingface.co/api/datasets-tags-by-type?type=task_ids
+        # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
         mteb_to_hf_subtype = {
             "Article retrieval": ["document-retrieval"],
             "Conversational retrieval": ["conversational", "utterance-retrieval"],
@@ -609,7 +609,7 @@
                 "hate-speech-detection",
             ],
             "Thematic clustering": [],
-            "Scientific Reranking": [],
+            "Scientific Reranking": ["text-scoring"],
             "Claim verification": ["fact-checking", "fact-checking-retrieval"],
             "Topic classification": ["topic-classification"],
             "Code retrieval": [],
@@ -617,21 +617,21 @@
             "Cross-Lingual Semantic Discrimination": [],
             "Textual Entailment": ["natural-language-inference"],
             "Counterfactual Detection": [],
-            "Emotion classification": [],
+            "Emotion classification": ["sentiment-classification"],
             "Reasoning as Retrieval": [],
             "Rendered Texts Understanding": [],
             "Image Text Retrieval": [],
             "Object recognition": [],
             "Scene recognition": [],
             "Caption Pairing": ["image-captioning"],
-            "Emotion recognition": [],
+            "Emotion recognition": ["sentiment-scoring"],
             "Textures recognition": [],
             "Activity recognition": [],
             "Tumor detection": [],
             "Duplicate Detection": [],
             "Rendered semantic textual similarity": [
                 "semantic-similarity-scoring",
-                "rendered semantic textual similarity",
+                "semantic-similarity-classification",
             ],
             "Intent classification": [
                 "intent-classification",
@@ -645,10 +645,8 @@

     def _hf_task_type(self) -> list[str]:
         # to get full list of task_types execute:
-        # requests.post("https://huggingface.co/api/validate-yaml", json={
-        #     "content": "---\ntask_categories: ['test']\n---", "repoType": "dataset"
-        # }).json()
-        # or look at https://huggingface.co/tasks
+        # https://huggingface.co/api/datasets-tags-by-type?type=task_categories
+        # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
         mteb_task_type_to_datasets = {
             # Text
             "BitextMining": ["translation"],
@@ -667,7 +665,7 @@
             "Any2AnyRetrieval": ["visual-document-retrieval"],
             "Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
             "VisionCentricQA": ["visual-question-answering"],
-            "ImageClustering": ["image-clustering"],
+            "ImageClustering": ["image-feature-extraction"],
             "ImageClassification": ["image-classification"],
             "ImageMultilabelClassification": ["image-classification"],
             "DocumentUnderstanding": ["visual-document-retrieval"],
@@ -695,11 +693,11 @@

     def _hf_languages(self) -> list[str]:
         languages: list[str] = []
-        if self.is_multilingual:
-            for val in list(self.eval_langs.values()):
+        if self.is_multilingual and isinstance(self.eval_langs, dict):
+            for val in self.eval_langs.values():
                 languages.extend(val)
         else:
-            languages = self.eval_langs
+            languages = cast(list[str], self.eval_langs)
         # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
         # or a special value like "code", "multilingual".
         readme_langs = []
@@ -711,7 +709,7 @@
                 readme_langs.append(lang_name)
         return sorted(set(readme_langs))

-    def _hf_license(self) -> str:
+    def _hf_license(self) -> str | None:
         dataset_license = self.license
         if dataset_license:
             license_mapping = {
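The `# type: ignore[misc]` on `PromptDict` is needed because the functional `TypedDict` form only type-checks with a literal dict of fields; building the fields from an enum is valid at runtime but opaque to mypy. A standalone sketch of the same pattern, where `PromptKind` and its values are assumed stand-ins for mteb's actual `PromptType`:

```python
from enum import Enum
from typing import TypedDict


class PromptKind(Enum):  # hypothetical stand-in for mteb's PromptType
    query = "query"
    document = "document"


# mypy cannot see through the dict comprehension, hence the targeted ignore;
# at runtime the functional form happily accepts any {name: type} mapping.
PromptDict = TypedDict(  # type: ignore[misc]
    "PromptDict",
    {kind.value: str for kind in PromptKind},
    total=False,
)

prompts: PromptDict = {"query": "Represent this sentence for retrieval:"}
```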
mteb/abstasks/text/bitext_mining.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Any, ClassVar, TypedDict
+from typing import Any, ClassVar, TypedDict, cast

 from datasets import Dataset, DatasetDict
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -10,7 +10,7 @@ from mteb._evaluators import BitextMiningEvaluator
 from mteb.abstasks._statistics_calculation import calculate_text_statistics
 from mteb.abstasks.abstask import AbsTask
 from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics

 logger = logging.getLogger(__name__)
@@ -73,11 +73,14 @@ class AbsTaskBitextMining(AbsTask):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
         """Added load for "parallel" datasets"""
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if not self.data_loaded:
             self.load_data()
@@ -87,11 +90,16 @@ class AbsTaskBitextMining(AbsTask):
         if subsets_to_run is not None:
             hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

-        scores = {}
+        encoder_model = cast(EncoderProtocol, model)
+
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
+        scores: dict[str, BitextMiningMetrics] = {}
         if self.parallel_subsets:
-            scores = self._evaluate_subset(
-                model,
-                self.dataset[split],  # type: ignore
+            scores = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
+                self.dataset[split],
                 parallel=True,
                 hf_split=split,
                 hf_subset="parallel",
@@ -109,8 +117,8 @@ class AbsTaskBitextMining(AbsTask):
                 data_split = self.dataset[split]
             else:
                 data_split = self.dataset[hf_subset][split]
-            scores[hf_subset] = self._evaluate_subset(
-                model,
+            scores[hf_subset] = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
                 data_split,
                 hf_split=split,
                 hf_subset=hf_subset,
@@ -119,32 +127,32 @@ class AbsTaskBitextMining(AbsTask):
                 **kwargs,
             )

-        return scores
+        return cast(dict[HFSubset, ScoresDict], scores)

     def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
         pairs = self._DEFAULT_PAIR
         if parallel:
-            pairs = [langpair.split("-") for langpair in self.hf_subsets]
+            pairs = [langpair.split("-") for langpair in self.hf_subsets]  # type: ignore[misc]
         return pairs

-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
         model: EncoderProtocol,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        parallel: bool = False,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        parallel: bool = False,
         **kwargs,
-    ) -> ScoresDict:
+    ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
         pairs = self._get_pairs(parallel)

         evaluator = BitextMiningEvaluator(
             data_split,
             task_metadata=self.metadata,
-            pair_columns=pairs,  # type: ignore
+            pair_columns=pairs,
             hf_split=hf_split,
             hf_subset=hf_subset,
             **kwargs,
@@ -168,16 +176,16 @@ class AbsTaskBitextMining(AbsTask):
         )

         if parallel:
-            metrics = {}
+            parallel_metrics = {}
             for keys, nearest_neighbors in neighbours.items():
-                metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
+                parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)

-            for v in metrics.values():
+            for v in parallel_metrics.values():
                 self._add_main_score(v)
-        else:
-            def_pair_str = "-".join(self._DEFAULT_PAIR[0])
-            metrics = self._compute_metrics(neighbours[def_pair_str], gold)
-            self._add_main_score(metrics)
+            return parallel_metrics
+        def_pair_str = "-".join(self._DEFAULT_PAIR[0])
+        metrics = self._compute_metrics(neighbours[def_pair_str], gold)
+        self._add_main_score(metrics)
         return metrics

     def _compute_metrics(
@@ -250,8 +258,11 @@ class AbsTaskBitextMining(AbsTask):
             )

     def _push_dataset_to_hub(self, repo_name: str) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
         if self.metadata.is_multilingual:
-            dataset = defaultdict(dict)
+            dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")

@@ -266,10 +277,10 @@
                 for split in self.dataset[config]:
                     dataset[split][lang_1] = self.dataset[config][split][sent_1]
                     dataset[split][lang_2] = self.dataset[config][split][sent_2]
-            for split in dataset:
-                dataset[split] = Dataset.from_dict(dataset[split])
-            dataset = DatasetDict(dataset)
-            dataset.push_to_hub(repo_name)
+            dataset_dict = DatasetDict(
+                {split: Dataset.from_dict(dataset[split]) for split in dataset}
+            )
+            dataset_dict.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
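Several of these abstask changes widen the signature to `MTEBModels` and then narrow back with an `isinstance(model, EncoderProtocol)` guard. For such a guard to succeed, the protocol presumably has to be declared `@runtime_checkable`; a minimal sketch of the pattern, with `EncoderLike` as a hypothetical stand-in for mteb's `EncoderProtocol`:

```python
from typing import Protocol, runtime_checkable


@runtime_checkable
class EncoderLike(Protocol):  # hypothetical stand-in for EncoderProtocol
    def encode(self, sentences: list[str], **kwargs) -> list[list[float]]: ...


class DummyEncoder:
    def encode(self, sentences: list[str], **kwargs) -> list[list[float]]:
        return [[0.0] * 4 for _ in sentences]


def evaluate(model: object) -> None:
    # isinstance on a runtime_checkable Protocol checks that the declared
    # methods exist on the object; no inheritance is required.
    if not isinstance(model, EncoderLike):
        raise TypeError("Expected model to be an instance of EncoderLike")
    model.encode(["hello", "world"])


evaluate(DummyEncoder())
```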
mteb/abstasks/text/reranking.py CHANGED
@@ -16,7 +16,7 @@ else:

 logger = logging.getLogger(__name__)

-OLD_FORMAT_RERANKING_TASKS = []
+OLD_FORMAT_RERANKING_TASKS: list[str] = []


 @deprecated(
@@ -100,12 +100,14 @@ class AbsTaskReranking(AbsTaskRetrieval):
         if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS:
             return

-        logging.info(
+        logger.info(
             f"Transforming old format to standard format for {self.metadata.name}"
         )

         given_dataset = copy(given_dataset)
-        self.dataset = defaultdict(lambda: defaultdict(dict))
+        self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
+            lambda: defaultdict(dict)  # type: ignore[arg-type]
+        )

         hf_subsets = self.hf_subsets

@@ -115,19 +117,19 @@
             if hf_subset in cur_dataset:
                 cur_dataset = cur_dataset[hf_subset]
             elif "name" in self.metadata.dataset:
-                cur_dataset = datasets.load_dataset(**self.metadata.dataset)  # type: ignore
+                cur_dataset = datasets.load_dataset(**self.metadata.dataset)
                 assert hf_subset == "default", (
                     f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
                 )
             else:
                 cur_dataset = datasets.load_dataset(
                     **self.metadata.dataset, name=hf_subset
-                )  # type: ignore
+                )

             for split in cur_dataset:
                 corpus = []
                 queries = []
-                relevant_docs = defaultdict(dict)
+                relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
                 top_ranked = defaultdict(list)

                 # Create an enumerated dataset to pass indices
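The annotated `self.dataset` assignment works around mypy's weak inference for nested `defaultdict`s: the lambda factory produces a plain `defaultdict` of `dict`, which mypy cannot prove matches the declared value type, hence the targeted ignore in the diff. The standalone pattern looks like this:

```python
from collections import defaultdict

# Annotating the variable pins the nested value types; in the diff the value
# type is RetrievalSplitData, which the plain `dict` factory cannot prove it
# produces, so a `# type: ignore[arg-type]` is needed there.
nested: dict[str, dict[str, dict]] = defaultdict(lambda: defaultdict(dict))
nested["default"]["test"]["queries"] = {"q1": "what is mteb?"}
```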
mteb/abstasks/text/summarization.py CHANGED
@@ -1,6 +1,5 @@
 import logging
 from pathlib import Path
-from typing import Any

 import numpy as np
 from datasets import Dataset
@@ -12,7 +11,8 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs
 from mteb.types.statistics import (
     ScoreStatistics,
     SplitDescriptiveStatistics,
@@ -77,17 +77,22 @@ class AbsTaskSummarization(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> SummarizationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = [
-            (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            (
+                (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            ).tolist()
             for x in data_split[self.relevancy_column_name]
         ]
         evaluator = self.evaluator(
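The added `.tolist()` converts each normalized numpy array back to a list of plain floats before it reaches the evaluator. The normalization itself is a per-sample min-max rescale; a standalone sketch with an assumed 1-5 score range:

```python
import numpy as np

min_score, max_score = 1, 5  # assumed score range for illustration
human_scores = [[3, 4, 5], [1, 2, 2]]

# Min-max normalize each sample's scores into [0, 1]; .tolist() drops the
# numpy types so downstream code receives plain Python floats.
normalized = [
    ((np.array(x) - min_score) / (max_score - min_score)).tolist()
    for x in human_scores
]
print(normalized)  # [[0.5, 0.75, 1.0], [0.0, 0.25, 0.25]]
```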
mteb/abstasks/zeroshot_classification.py CHANGED
@@ -1,13 +1,14 @@
 import logging
 from pathlib import Path
-from typing import Any, TypedDict
+from typing import TypedDict

 import torch
 from datasets import Dataset
 from sklearn import metrics

 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -111,15 +112,18 @@ class AbsTaskZeroShotClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         candidate_labels = self.get_candidate_labels()
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
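`EncodeKwargs` replaces the untyped `dict[str, Any]` for encode options across these tasks, so typos in option names become type errors rather than silently ignored keys. A hypothetical sketch of the general shape; the field names below are assumptions, not the actual fields of `mteb.types.EncodeKwargs`:

```python
from typing import TypedDict


class EncodeKwargsSketch(TypedDict, total=False):
    # Hypothetical fields; consult mteb.types.EncodeKwargs for the real ones.
    batch_size: int
    show_progress_bar: bool


def run(encode_kwargs: EncodeKwargsSketch) -> None:
    print(encode_kwargs.get("batch_size", 32))


run({"batch_size": 16})
```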
mteb/benchmarks/benchmark.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from collections.abc import Iterable, Sequence
+from collections.abc import Iterator, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal
@@ -19,6 +19,7 @@ class Benchmark:

     Args:
         name: The name of the benchmark
+        aliases: Alternative names for the benchmark
         tasks: The tasks within the benchmark.
         description: A description of the benchmark, should include its intended goal and potentially a description of its construction
         reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
@@ -38,6 +39,7 @@

     name: str
     tasks: Sequence[AbsTask]
+    aliases: Sequence[str] = field(default_factory=tuple)
     description: str | None = None
     reference: StrURL | None = None
     citation: str | None = None
@@ -47,7 +49,7 @@
     display_name: str | None = None
     language_view: list[str] | Literal["all"] = field(default_factory=list)

-    def __iter__(self) -> Iterable[AbsTask]:
+    def __iter__(self) -> Iterator[AbsTask]:
         return iter(self.tasks)

     def __len__(self) -> int:
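Two small but meaningful fixes here: `__iter__` should return an `Iterator` (an `Iterable` is merely something `iter()` accepts, too weak for callers that rely on `next()`), and the new `aliases` field gives a benchmark alternative lookup names. A standalone sketch of the corrected typing, using a simplified stand-in for mteb's `Benchmark`:

```python
from collections.abc import Iterator, Sequence
from dataclasses import dataclass, field


@dataclass
class Bench:  # simplified stand-in for mteb's Benchmark
    name: str
    tasks: Sequence[str]
    aliases: Sequence[str] = field(default_factory=tuple)

    def __iter__(self) -> Iterator[str]:
        # iter() returns an Iterator, so the annotation now matches reality.
        return iter(self.tasks)


b = Bench("demo", ["TaskA", "TaskB"], aliases=("demo-v1",))
print(list(b))
```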
mteb/benchmarks/benchmarks/__init__.py CHANGED
@@ -6,6 +6,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
+    CHEMTEB_V1_1,
     CODE_RAG,
     ENCODECHKA,
     FA_MTEB,
@@ -14,6 +15,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     JINA_VDR,
     JMTEB_LITE_V1,
     JMTEB_V2,
+    KOVIDORE_V2,
     LONG_EMBED,
     MIEB_ENG,
     MIEB_IMG,
@@ -69,6 +71,7 @@ __all__ = [
     "BRIGHT_LONG",
     "BUILT_MTEB",
     "CHEMTEB",
+    "CHEMTEB_V1_1",
     "CODE_RAG",
     "C_MTEB",
     "ENCODECHKA",
@@ -79,6 +82,7 @@
     "JINA_VDR",
     "JMTEB_LITE_V1",
     "JMTEB_V2",
+    "KOVIDORE_V2",
     "LONG_EMBED",
     "MIEB_ENG",
     "MIEB_IMG",