mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@ from sklearn.metrics import (
 
 from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
 from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import HFSubset, ScoresDict
+from mteb.types import EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
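
Note: a recurring change throughout this release is replacing the loose `encode_kwargs: dict[str, Any]` parameter with the new `EncodeKwargs` type from `mteb.types` (added in `mteb/types/_encoder_io.py`, file 232 above). A minimal sketch of what such a typed kwargs dict buys, assuming a `TypedDict` with optional fields; the `batch_size` field here is an illustrative assumption, not the confirmed schema:

    from typing import TypedDict

    class EncodeKwargsSketch(TypedDict, total=False):
        batch_size: int  # hypothetical field; the real schema is in mteb/types/_encoder_io.py

    def evaluate(encode_kwargs: EncodeKwargsSketch) -> None:
        # Unlike dict[str, Any], a TypedDict lets type checkers flag
        # misspelled or unknown keys at check time.
        print(encode_kwargs.get("batch_size", 32))

    evaluate({"batch_size": 16})
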
@@ -98,9 +98,8 @@ class AbsTaskClassification(AbsTask):
         text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
         label: int. Column name can be changed via `label_column_name` attribute.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
-            Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
-        samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
-        n_experiments: Number of experiments to run. Default is 10.
+        samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
+        n_experiments: Number of experiments to run. Default is 10.
         train_split: Name of the split to use for training the evaluator model. Default is "train".
         label_column_name: Name of the column containing the labels. Default is "label".
         input_column_name: Name of the column containing the input data. Default is "text".
@@ -126,7 +125,7 @@
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
@@ -143,6 +142,9 @@
         if not self.data_loaded:
             self.load_data()
 
+        if self.dataset is None:
+            raise RuntimeError("Dataset not loaded.")
+
         if "random_state" in self.evaluator_model.get_params():
             self.evaluator_model = self.evaluator_model.set_params(
                 random_state=self.seed
@@ -175,19 +177,22 @@
             )
             self._add_main_score(scores[hf_subset])
 
-        return scores
+        return scores  # type: ignore[return-value]
 
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         train_split = data_split[self.train_split]
         eval_split = data_split[hf_split]
 
@@ -237,7 +242,7 @@
             # ap will be none for non binary classification tasks
             k: (
                 float(np.mean(values))
-                if (values := [s[k] for s in scores if s[k] is not None])
+                if (values := [s[k] for s in scores if s[k] is not None])  # type: ignore[literal-required]
                 else np.nan
             )
             for k in scores[0].keys()
@@ -245,7 +250,7 @@
         logger.info(f"Running {self.metadata.name} - Finished.")
         return FullClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )
 
     def _calculate_scores(
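
Note: the hunks above establish a pattern repeated across the task types in this release: `_evaluate_subset` widens its parameter from `EncoderProtocol` to the `MTEBModels` union and narrows it back with a runtime `isinstance` guard. A minimal sketch of the narrowing this amounts to (the helper name is ours, for illustration only):

    from mteb.models import EncoderProtocol, MTEBModels

    def narrow_to_encoder(model: MTEBModels) -> EncoderProtocol:
        # EncoderProtocol is runtime-checkable, so isinstance() works on it;
        # after the guard, static checkers treat `model` as an encoder.
        if not isinstance(model, EncoderProtocol):
            raise TypeError("Expected model to be an instance of EncoderProtocol")
        return model
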
@@ -3,7 +3,7 @@ import logging
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 
 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.cluster import v_measure_score
 
 from mteb._create_dataloaders import create_dataloader
-from mteb.models import EncoderProtocol
-from mteb.types import HFSubset, ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]
 
 
 def _evaluate_clustering_bootstrapped(
-    embeddings: np.ndarray,
+    embeddings: Array,
     labels: list[list[str]],
     n_clusters: int,
     cluster_size: int,
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
     max_depth = max(map(len, labels))
     # Evaluate on each level til max depth
     for i_level in range(max_depth):
-        level_labels = []
+        level_labels: list[str | int] = []
         # Assign -1 to gold label if the level is not there
         for label in labels:
             if len(label) > i_level:
                 level_labels.append(label[i_level])
             else:
                 level_labels.append(-1)
-        level_labels = np.array(level_labels)
+        np_level_labels = np.array(level_labels)
         valid_idx = np.array(
-            [level_label != -1 for level_label in level_labels]
+            [level_label != -1 for level_label in np_level_labels]
         )  # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
-        level_labels = level_labels[valid_idx]
+        np_level_labels = np_level_labels[valid_idx]
         level_embeddings = embeddings[valid_idx]
         clustering_model = MiniBatchKMeans(
-            n_clusters=np.unique(level_labels).size,
+            n_clusters=np.unique(np_level_labels).size,
             batch_size=kmean_batch_size,
             init="k-means++",
             n_init=1,  # default when kmeans++ is used
@@ -87,7 +87,7 @@
         cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)
 
         _embeddings = level_embeddings[cluster_indices]
-        _labels = level_labels[cluster_indices]
+        _labels = np_level_labels[cluster_indices]
         cluster_assignment = clustering_model.fit_predict(_embeddings)
         v_measure = v_measure_score(_labels, cluster_assignment)
         v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,15 +153,19 @@ class AbsTaskClustering(AbsTask):
 
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError(
+                "Expected encoder model to be an instance of EncoderProtocol."
+            )
         if (
             self.max_document_to_embed is not None
             and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +186,13 @@
                 self.max_fraction_of_documents_to_embed * len(data_split)
             )
         else:
-            max_documents_to_embed = self.max_document_to_embed
+            max_documents_to_embed = cast(int, self.max_document_to_embed)
 
-        max_documents_to_embed = min(len(data_split), max_documents_to_embed)  # type: ignore
+        max_documents_to_embed = min(len(data_split), max_documents_to_embed)
         example_indices = self.rng_state.sample(
             range(len(data_split)), k=max_documents_to_embed
         )
-        downsampled_dataset = data_split.select(example_indices)  # type: ignore
+        downsampled_dataset = data_split.select(example_indices)
 
         downsampled_dataset = downsampled_dataset.select_columns(
             [self.input_column_name, self.label_column_name]
@@ -8,8 +8,8 @@ from scipy.optimize import linear_sum_assignment
 from sklearn import metrics
 
 from mteb._evaluators import ClusteringEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import ScoresDict
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, ScoresDict
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -80,15 +80,18 @@ class AbsTaskClusteringLegacy(AbsTask):
 
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ScoresDict:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
         )
@@ -139,9 +142,6 @@
            }
            return scores
 
-        data_split = data_split.select_columns(
-            [self.input_column_name, self.label_column_name]
-        )
         evaluator = self.evaluator(
             data_split,
             input_column_name=self.input_column_name,
@@ -151,10 +151,10 @@
             hf_subset=hf_subset,
             **kwargs,
         )
-        clusters = evaluator(model, encode_kwargs=encode_kwargs)
+        evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
         if prediction_folder:
             self._save_task_predictions(
-                clusters,
+                evaluate_clusters,
                 model,
                 prediction_folder,
                 hf_subset=hf_subset,
@@ -163,7 +163,7 @@
 
         return self._compute_metrics(
             data_split[self.label_column_name],
-            clusters,
+            evaluate_clusters,
         )
 
     def _compute_metrics(
@@ -12,7 +12,8 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models.models_protocols import EncoderProtocol
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs
 from mteb.types.statistics import (
     ImageStatistics,
     SplitDescriptiveStatistics,
@@ -116,15 +117,17 @@ class AbsTaskImageTextPairClassification(AbsTask):
 
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> ImageTextPairClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
         select_columns = []
         for columns in (self.images_column_names, self.texts_column_names):
             if isinstance(columns, str):
@@ -154,7 +157,7 @@
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs)  # type: ignore[assignment]
         if prediction_folder:
             self._save_task_predictions(
                 [score.tolist() for score in scores],
@@ -16,7 +16,8 @@ from typing_extensions import override
 from mteb._create_dataloaders import create_dataloader
 from mteb._evaluators.classification_metrics import hamming_score
 from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
+from mteb.types import Array, EncodeKwargs
 
 from .classification import AbsTaskClassification
 
@@ -24,14 +25,14 @@ logger = logging.getLogger(__name__)
 
 
 def _evaluate_classifier(
-    embeddings_train: np.ndarray,
+    embeddings_train: Array,
     y_train: np.ndarray,
-    embeddings_test: np.ndarray,
+    embeddings_test: Array,
     classifier: SklearnModelProtocol,
 ) -> tuple[np.ndarray, SklearnModelProtocol]:
-    classifier: SklearnModelProtocol = clone(classifier)
-    classifier.fit(embeddings_train, y_train)
-    return classifier.predict(embeddings_test), classifier
+    classifier_copy: SklearnModelProtocol = clone(classifier)
+    classifier_copy.fit(embeddings_train, y_train)
+    return classifier_copy.predict(embeddings_test), classifier_copy
 
 
 class MultilabelClassificationMetrics(TypedDict):
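
Note: for context on the `_evaluate_classifier` change above, `sklearn.base.clone` returns a fresh, unfitted estimator with the same hyperparameters, so each experiment fits a copy while the shared template stays untouched; binding the clone to a new name (`classifier_copy`) also avoids re-annotating the `classifier` parameter, which type checkers reject. A standalone illustration:

    from sklearn.base import clone
    from sklearn.neighbors import KNeighborsClassifier

    template = KNeighborsClassifier(n_neighbors=5)
    clf = clone(template)  # fresh, unfitted estimator with identical params
    clf.fit([[0.0], [0.1], [0.2], [1.0], [1.1], [1.2]], [0, 0, 0, 1, 1, 1])
    assert not hasattr(template, "classes_")  # the template was never fitted
    print(clf.predict([[1.05]]))  # -> [1]
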
@@ -69,25 +70,28 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         input_column_name: Name of the column containing the input text.
         label_column_name: Name of the column containing the labels.
         samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
-        evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
+        evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
     """
 
-    evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
+    evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
     input_column_name: str = "text"
     label_column_name: str = "label"
 
     @override
-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: DatasetDict,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> FullMultilabelClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if isinstance(data_split, DatasetDict):
             data_split = data_split.select_columns(
                 [self.input_column_name, self.label_column_name]
@@ -165,7 +169,7 @@
             y_train = train_split.select(sample_indices)[self.label_column_name]
             y_train = binarizer.transform(y_train)
             y_pred, current_classifier = _evaluate_classifier(
-                X_train, y_train, X_test, self.evaluator
+                X_train, y_train, X_test, self.evaluator_model
             )
             if prediction_folder:
                 all_predictions.append(y_pred.tolist())
@@ -185,19 +189,20 @@
         )
 
         avg_scores: dict[str, Any] = {
-            k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
+            k: np.mean([s[k] for s in scores])  # type: ignore[literal-required]
+            for k in scores[0].keys()
         }
         logger.info("Running multilabel classification - Finished.")
         return FullMultilabelClassificationMetrics(
             scores_per_experiment=scores,
-            **avg_scores,
+            **avg_scores,  # type: ignore[typeddict-item]
         )
 
-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray,
         y_pred: np.ndarray,
-        x_test_embedding: np.ndarray,
+        x_test_embedding: Array,
         current_classifier: SklearnModelProtocol,
     ) -> MultilabelClassificationMetrics:
         accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -232,10 +237,9 @@
         """
         sample_indices = []
         if idxs is None:
-            idxs = np.arange(len(y))
+            idxs = list(np.arange(len(y)))
         self.np_rng.shuffle(idxs)
-        idxs = idxs.tolist()
-        label_counter = defaultdict(int)
+        label_counter: dict[int, int] = defaultdict(int)
         for i in idxs:
             if any((label_counter[label] < samples_per_label) for label in y[i]):
                 sample_indices.append(i)
@@ -18,8 +18,8 @@ from mteb.abstasks._statistics_calculation import (
 )
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol
-from mteb.types import PromptType
+from mteb.models.models_protocols import EncoderProtocol, MTEBModels
+from mteb.types import EncodeKwargs, PromptType
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -44,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
     """
 
     num_samples: int
-    number_of_characters: int
-    unique_pairs: int
+    number_of_characters: int | None
+    unique_pairs: int | None
 
     text1_statistics: TextStatistics | None
     image1_statistics: ImageStatistics | None
@@ -79,15 +79,18 @@ class AbsTaskPairClassification(AbsTask):
 
     def _evaluate_subset(
         self,
-        model: EncoderProtocol,
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs: dict[str, str],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> dict[str, float]:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if self.metadata.modalities == ["text"]:
             # for compatibility with v1 version where datasets were stored in a single row
             data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -120,7 +123,7 @@
         self, similarity_scores: PairClassificationDistances, labels: list[int]
     ) -> dict[str, float]:
         logger.info("Computing metrics...")
-        labels = np.asarray(labels)
+        np_labels = np.asarray(labels)
         output_scores = {}
         max_scores = defaultdict(list)
         for short_name, scores, reverse in [
@@ -142,7 +145,7 @@
             ],
             [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
         ]:
-            metrics = self._compute_metrics_values(scores, labels, reverse)
+            metrics = self._compute_metrics_values(scores, np_labels, reverse)  # type: ignore[arg-type]
             for metric_name, metric_value in metrics.items():
                 output_scores[f"{short_name}_{metric_name}"] = metric_value
                 max_scores[metric_name].append(metric_value)
@@ -237,6 +240,12 @@
 
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         # previously pair classification datasets were stored in a single row
+        if self.dataset is None:
+            # overall this shouldn't happen as we check for dataset before pushing to hub
+            # added here for type checking purposes
+            raise RuntimeError(
+                "Dataset not loaded. To load dataset run `task.load_data()`."
+            )
         if self.metadata.is_multilingual:
             for subset in self.dataset:
                 for split in self.dataset[subset]:
@@ -290,13 +299,13 @@
         )
 
     def _find_best_acc_and_threshold(
-        self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
+        self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
     ) -> tuple[float, float]:
         rows = list(zip(scores, labels))
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
 
         max_acc = 0
-        best_threshold = -1
+        best_threshold = -1.0
         positive_so_far = 0
         remaining_negatives = sum(np.array(labels) == 0)
@@ -323,7 +332,7 @@
 
         rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
 
-        best_f1 = best_precision = best_recall = 0
+        best_f1 = best_precision = best_recall = 0.0
         threshold = 0
         nextract = 0
         ncorrect = 0
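
Note: the two `_find_best_*` hunks above make the same fix: mypy infers a variable's type from its initializer, so an accumulator initialized with an `int` literal cannot later hold a `float`. A minimal reproduction (runs fine, but strict mypy flags the first reassignment):

    best_f1 = 0      # mypy infers int
    best_f1 = 0.87   # mypy: incompatible types in assignment (float to int)

    best_acc = 0.0   # inferred as float
    best_acc = 0.91  # fine
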
@@ -84,10 +84,10 @@ class AbsTaskRegression(AbsTaskClassification):
         n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
         abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
         evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
-            Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
+
     """
 
-    evaluator: type[SklearnModelProtocol] = SklearnEvaluator
+    evaluator: type[SklearnEvaluator] = SklearnEvaluator
     evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)
 
     train_split: str = "train"
@@ -113,7 +113,7 @@
         )["train"]
         return train_split_sampled, []
 
-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray | list[int],
         y_pred: np.ndarray,
@@ -183,7 +183,7 @@
 
         return dataset_dict
 
-    def _calculate_descriptive_statistics_from_split(
+    def _calculate_descriptive_statistics_from_split(  # type: ignore[override]
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> RegressionDescriptiveStatistics:
         train_text = []
@@ -1,7 +1,7 @@
 import json
 import logging
 from collections import defaultdict
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from pathlib import Path
 from time import time
 from typing import Any, Literal
@@ -25,6 +25,7 @@ from mteb.models import (
     SearchProtocol,
 )
 from mteb.types import (
+    EncodeKwargs,
     HFSubset,
     QueryDatasetType,
     RelevantDocumentsType,
@@ -184,17 +185,17 @@ class AbsTaskRetrieval(AbsTask):
             return queries, corpus
 
         if self.metadata.is_multilingual:
-            for subset in self.queries:
-                for split in self.queries[subset]:
-                    queries = self.queries[subset][split]
-                    corpus = self.corpus[subset][split]
+            for subset in self.queries:  # type: ignore[attr-defined]
+                for split in self.queries[subset]:  # type: ignore[attr-defined]
+                    queries = self.queries[subset][split]  # type: ignore[attr-defined]
+                    corpus = self.corpus[subset][split]  # type: ignore[attr-defined]
 
                     (
                         self.dataset[subset][split]["queries"],
                         self.dataset[subset][split]["corpus"],
                     ) = _process_split(queries, corpus)
 
-                    self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
+                    self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[  # type: ignore[attr-defined]
                         subset
                     ][split]
                     if hasattr(self, "instructions"):
@@ -211,15 +212,15 @@
                     ][split]
         else:
             subset = "default"
-            for split in self.queries:
-                queries = self.queries[split]
-                corpus = self.corpus[split]
+            for split in self.queries:  # type: ignore[attr-defined]
+                queries = self.queries[split]  # type: ignore[attr-defined]
+                corpus = self.corpus[split]  # type: ignore[attr-defined]
                 (
                     self.dataset[subset][split]["queries"],
                     self.dataset[subset][split]["corpus"],
                 ) = _process_split(queries, corpus)
 
-                self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
+                self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[  # type: ignore[attr-defined]
                     split
                 ].copy()
                 if hasattr(self, "instructions"):
@@ -235,9 +236,9 @@
                     split
                 ].copy()
 
-        del self.queries
-        del self.corpus
-        del self.relevant_docs
+        del self.queries  # type: ignore[attr-defined]
+        del self.corpus  # type: ignore[attr-defined]
+        del self.relevant_docs  # type: ignore[attr-defined]
         if hasattr(self, "instructions"):
             del self.instructions
         if hasattr(self, "top_ranked"):
@@ -283,10 +284,10 @@
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
-        **kwargs,
-    ) -> dict[HFSubset, ScoresDict]:
+        **kwargs: Any,
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluate the model on the retrieval task.
 
         Args:
@@ -320,7 +321,7 @@
         self,
         model: MTEBModels,
         data_split: RetrievalSplitData,
-        encode_kwargs: dict[str, Any],
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
@@ -357,6 +358,8 @@
             **kwargs,
         )
 
+        search_model: SearchProtocol
+
         if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
             search_model = SearchEncoderWrapper(model)
         elif isinstance(model, CrossEncoderProtocol):
@@ -578,11 +581,12 @@
             if isinstance(data[split][subset_item], Dataset):
                 sections[split] = data[split][subset_item]
             elif converter is not None:
+                subset_data = data[split][subset_item]
+                if subset_data is None:
+                    continue
+
                 sections[split] = Dataset.from_list(
-                    [
-                        converter(idx, item)
-                        for idx, item in data[split][subset_item].items()
-                    ]
+                    [converter(idx, item) for idx, item in subset_data.items()]
                 )
             else:
                 raise ValueError(
@@ -680,7 +684,7 @@
 
         top_k_sorted = defaultdict(list)
         for query_id, values in top_ranked.items():
-            sorted_keys = sorted(values, key=values.get, reverse=True)
+            sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
            top_k_sorted[query_id] = sorted_keys[: self._top_k]
 
        self.dataset[subset][split]["top_ranked"] = top_k_sorted
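
Note: the `key=values.get` → `key=lambda k: values[k]` change above is also typing-driven: `dict.get` is typed as returning `V | None`, which strict type checkers reject as a sort key, while indexing keeps the key type non-optional. The ordering is identical for keys present in the dict:

    scores = {"doc1": 0.9, "doc2": 0.4, "doc3": 0.7}
    # scores.get types as float | None; scores[k] stays float.
    ranked = sorted(scores, key=lambda k: scores[k], reverse=True)
    assert ranked == ["doc1", "doc3", "doc2"]
    print(ranked[:2])  # top-k truncation, as in the hunk
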
@@ -688,10 +692,10 @@
 
 
 def _process_relevant_docs(
-    collection: dict[str, dict[str, float]],
+    collection: Mapping[str, Mapping[str, int]],
     hf_subset: str,
     split: str,
-) -> dict[str, dict[str, float]]:
+) -> dict[str, dict[str, int]]:
     """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this
 
     Returns: