mteb 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. mteb/_create_dataloaders.py +16 -9
  2. mteb/_evaluators/any_sts_evaluator.py +10 -5
  3. mteb/_evaluators/clustering_evaluator.py +10 -4
  4. mteb/_evaluators/evaluator.py +9 -4
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  6. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  7. mteb/_evaluators/retrieval_evaluator.py +19 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +14 -10
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +17 -9
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +15 -6
  21. mteb/abstasks/clustering.py +17 -8
  22. mteb/abstasks/clustering_legacy.py +14 -6
  23. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  24. mteb/abstasks/multilabel_classification.py +11 -5
  25. mteb/abstasks/pair_classification.py +19 -9
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +28 -17
  28. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  29. mteb/abstasks/sts.py +19 -10
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +14 -7
  32. mteb/abstasks/text/summarization.py +17 -7
  33. mteb/abstasks/zeroshot_classification.py +15 -7
  34. mteb/benchmarks/_create_table.py +13 -3
  35. mteb/benchmarks/benchmark.py +11 -1
  36. mteb/benchmarks/benchmarks/__init__.py +2 -0
  37. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  38. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  39. mteb/cache.py +10 -5
  40. mteb/cli/_display_tasks.py +9 -3
  41. mteb/cli/build_cli.py +5 -2
  42. mteb/cli/generate_model_card.py +9 -2
  43. mteb/deprecated_evaluator.py +16 -12
  44. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  45. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  64. mteb/evaluate.py +20 -18
  65. mteb/filter_tasks.py +12 -7
  66. mteb/get_tasks.py +9 -4
  67. mteb/languages/language_scripts.py +8 -3
  68. mteb/leaderboard/app.py +7 -3
  69. mteb/leaderboard/table.py +7 -2
  70. mteb/load_results.py +9 -3
  71. mteb/models/abs_encoder.py +22 -12
  72. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  73. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  74. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  75. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  76. mteb/models/get_model_meta.py +11 -4
  77. mteb/models/instruct_wrapper.py +13 -5
  78. mteb/models/model_implementations/align_models.py +10 -4
  79. mteb/models/model_implementations/amazon_models.py +1 -0
  80. mteb/models/model_implementations/andersborges.py +2 -0
  81. mteb/models/model_implementations/ara_models.py +1 -0
  82. mteb/models/model_implementations/arctic_models.py +8 -0
  83. mteb/models/model_implementations/b1ade_models.py +1 -0
  84. mteb/models/model_implementations/bedrock_models.py +20 -6
  85. mteb/models/model_implementations/bge_models.py +40 -1
  86. mteb/models/model_implementations/bica_model.py +1 -0
  87. mteb/models/model_implementations/blip2_models.py +11 -4
  88. mteb/models/model_implementations/blip_models.py +17 -4
  89. mteb/models/model_implementations/bm25.py +22 -14
  90. mteb/models/model_implementations/bmretriever_models.py +10 -2
  91. mteb/models/model_implementations/cadet_models.py +1 -0
  92. mteb/models/model_implementations/cde_models.py +11 -5
  93. mteb/models/model_implementations/clip_models.py +12 -4
  94. mteb/models/model_implementations/clips_models.py +3 -0
  95. mteb/models/model_implementations/codefuse_models.py +5 -0
  96. mteb/models/model_implementations/codesage_models.py +3 -0
  97. mteb/models/model_implementations/cohere_models.py +14 -4
  98. mteb/models/model_implementations/cohere_v.py +14 -4
  99. mteb/models/model_implementations/colpali_models.py +7 -3
  100. mteb/models/model_implementations/colqwen_models.py +17 -31
  101. mteb/models/model_implementations/colsmol_models.py +3 -1
  102. mteb/models/model_implementations/conan_models.py +11 -4
  103. mteb/models/model_implementations/dino_models.py +28 -4
  104. mteb/models/model_implementations/e5_instruct.py +4 -0
  105. mteb/models/model_implementations/e5_models.py +9 -0
  106. mteb/models/model_implementations/e5_v.py +10 -4
  107. mteb/models/model_implementations/eagerworks_models.py +11 -4
  108. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  109. mteb/models/model_implementations/en_code_retriever.py +1 -0
  110. mteb/models/model_implementations/euler_models.py +1 -0
  111. mteb/models/model_implementations/evaclip_models.py +13 -4
  112. mteb/models/model_implementations/fa_models.py +9 -0
  113. mteb/models/model_implementations/facebookai.py +2 -0
  114. mteb/models/model_implementations/geogpt_models.py +1 -0
  115. mteb/models/model_implementations/gme_v_models.py +7 -3
  116. mteb/models/model_implementations/google_models.py +15 -4
  117. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  118. mteb/models/model_implementations/gritlm_models.py +2 -0
  119. mteb/models/model_implementations/gte_models.py +9 -0
  120. mteb/models/model_implementations/hinvec_models.py +6 -1
  121. mteb/models/model_implementations/human.py +1 -0
  122. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  123. mteb/models/model_implementations/inf_models.py +2 -0
  124. mteb/models/model_implementations/jasper_models.py +14 -5
  125. mteb/models/model_implementations/jina_clip.py +10 -4
  126. mteb/models/model_implementations/jina_models.py +17 -5
  127. mteb/models/model_implementations/kalm_models.py +24 -12
  128. mteb/models/model_implementations/kblab.py +1 -0
  129. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  130. mteb/models/model_implementations/kfst.py +1 -0
  131. mteb/models/model_implementations/kowshik24_models.py +1 -0
  132. mteb/models/model_implementations/lens_models.py +2 -0
  133. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  134. mteb/models/model_implementations/linq_models.py +7 -1
  135. mteb/models/model_implementations/listconranker.py +10 -4
  136. mteb/models/model_implementations/llm2clip_models.py +12 -4
  137. mteb/models/model_implementations/llm2vec_models.py +20 -6
  138. mteb/models/model_implementations/mcinext_models.py +8 -2
  139. mteb/models/model_implementations/mdbr_models.py +2 -0
  140. mteb/models/model_implementations/misc_models.py +63 -0
  141. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  142. mteb/models/model_implementations/mme5_models.py +2 -1
  143. mteb/models/model_implementations/moco_models.py +11 -4
  144. mteb/models/model_implementations/mod_models.py +2 -1
  145. mteb/models/model_implementations/model2vec_models.py +23 -4
  146. mteb/models/model_implementations/moka_models.py +3 -0
  147. mteb/models/model_implementations/nbailab.py +3 -0
  148. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  149. mteb/models/model_implementations/nomic_models.py +16 -4
  150. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  151. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  152. mteb/models/model_implementations/nvidia_models.py +15 -4
  153. mteb/models/model_implementations/octen_models.py +3 -1
  154. mteb/models/model_implementations/openai_models.py +14 -4
  155. mteb/models/model_implementations/openclip_models.py +17 -4
  156. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  157. mteb/models/model_implementations/ops_moa_models.py +9 -2
  158. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  159. mteb/models/model_implementations/pawan_models.py +1 -0
  160. mteb/models/model_implementations/piccolo_models.py +2 -0
  161. mteb/models/model_implementations/promptriever_models.py +16 -6
  162. mteb/models/model_implementations/pylate_models.py +22 -13
  163. mteb/models/model_implementations/qodo_models.py +2 -0
  164. mteb/models/model_implementations/qtack_models.py +1 -0
  165. mteb/models/model_implementations/qwen3_models.py +11 -1
  166. mteb/models/model_implementations/qzhou_models.py +2 -0
  167. mteb/models/model_implementations/random_baseline.py +4 -3
  168. mteb/models/model_implementations/rasgaard_models.py +1 -0
  169. mteb/models/model_implementations/reasonir_model.py +65 -0
  170. mteb/models/model_implementations/repllama_models.py +15 -6
  171. mteb/models/model_implementations/rerankers_custom.py +13 -4
  172. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  173. mteb/models/model_implementations/richinfoai_models.py +1 -0
  174. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  175. mteb/models/model_implementations/ruri_models.py +10 -0
  176. mteb/models/model_implementations/salesforce_models.py +10 -1
  177. mteb/models/model_implementations/samilpwc_models.py +1 -0
  178. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  179. mteb/models/model_implementations/searchmap_models.py +1 -0
  180. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  181. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  182. mteb/models/model_implementations/seed_models.py +2 -1
  183. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  184. mteb/models/model_implementations/shuu_model.py +1 -0
  185. mteb/models/model_implementations/siglip_models.py +19 -4
  186. mteb/models/model_implementations/slm_models.py +7 -4
  187. mteb/models/model_implementations/sonar_models.py +2 -1
  188. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  189. mteb/models/model_implementations/stella_models.py +6 -0
  190. mteb/models/model_implementations/tarka_models.py +2 -0
  191. mteb/models/model_implementations/text2vec_models.py +3 -0
  192. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  193. mteb/models/model_implementations/uae_models.py +10 -4
  194. mteb/models/model_implementations/vdr_models.py +8 -1
  195. mteb/models/model_implementations/vi_vn_models.py +6 -0
  196. mteb/models/model_implementations/vista_models.py +11 -4
  197. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  198. mteb/models/model_implementations/voyage_models.py +25 -4
  199. mteb/models/model_implementations/voyage_v.py +11 -6
  200. mteb/models/model_implementations/xyz_models.py +1 -0
  201. mteb/models/model_implementations/youtu_models.py +1 -0
  202. mteb/models/model_implementations/yuan_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models_en.py +2 -1
  204. mteb/models/model_meta.py +47 -9
  205. mteb/models/models_protocols.py +19 -18
  206. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  207. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  208. mteb/models/search_wrappers.py +19 -12
  209. mteb/models/sentence_transformer_wrapper.py +4 -3
  210. mteb/models/vllm_wrapper.py +8 -6
  211. mteb/results/benchmark_results.py +22 -17
  212. mteb/results/model_result.py +21 -15
  213. mteb/results/task_result.py +15 -9
  214. mteb/similarity_functions.py +8 -2
  215. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  216. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  220. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  223. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  224. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  225. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  226. mteb/tasks/retrieval/eng/__init__.py +42 -0
  227. mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
  228. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  229. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  230. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  231. mteb/types/_encoder_io.py +1 -1
  232. mteb/types/statistics.py +9 -2
  233. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
  234. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/RECORD +238 -217
  235. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
  236. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
  237. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
  238. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,7 @@ bica_base = ModelMeta(
  revision="31237a836e5ae908c308a256573e5f0986498574",
  release_date="2025-11-14",
  n_parameters=110_000_000,
+ n_embedding_parameters=23_440_896,
  memory_usage_mb=418,
  embed_dim=768,
  license="mit",
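Most of the hunks below add a new `n_embedding_parameters` field to `ModelMeta`. The values appear to track the size of the model's input (token) embedding matrix, e.g. a 30_522-entry vocabulary times 768 dimensions gives the 23_440_896 above. As a rough, hedged sketch (not part of this diff; the helper name is hypothetical), such a count can be read off a Hugging Face checkpoint like this:

# Hypothetical helper (not part of mteb): count input-embedding parameters,
# which is what the new n_embedding_parameters field appears to record.
from transformers import AutoModel

def count_embedding_parameters(model_name: str) -> int:
    model = AutoModel.from_pretrained(model_name)
    # e.g. bert-base-uncased: 30_522 (vocab) * 768 (hidden) = 23_440_896
    return model.get_input_embeddings().weight.numel()

print(count_embedding_parameters("bert-base-uncased"))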
@@ -1,14 +1,19 @@
- from typing import Any
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
 
  import torch
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
  from mteb._requires_package import requires_package
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput, PromptType
 
  BLIP2_CITATION = """@inproceedings{li2023blip2,
  title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
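A recurring change across the model implementation files in this release is the import refactor shown above: imports used only in type annotations move under `if TYPE_CHECKING:`, with `from __future__ import annotations` deferring annotation evaluation so those names are never needed at runtime. A minimal, self-contained sketch of the pattern (illustrative names, not taken from mteb):

from __future__ import annotations  # annotations become strings, evaluated lazily

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by type checkers; skipped at runtime, so importing this
    # module no longer pulls in torch's DataLoader just for a type hint.
    from torch.utils.data import DataLoader


def encode_all(loader: DataLoader) -> list[float]:
    # The annotation above is never evaluated at runtime, so no import error.
    return [float(len(batch)) for batch in loader]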
@@ -172,6 +177,7 @@ blip2_opt_2_7b = ModelMeta(
  release_date="2024-03-22",
  modalities=["image", "text"],
  n_parameters=3_740_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=14285,
  max_tokens=None,
  embed_dim=768,
@@ -196,6 +202,7 @@ blip2_opt_6_7b_coco = ModelMeta(
  release_date="2024-03-31",
  modalities=["image", "text"],
  n_parameters=7_750_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=29577,
  max_tokens=None,
  embed_dim=768,
@@ -1,14 +1,19 @@
- from typing import Any
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
 
  import torch
  from torch.nn.functional import normalize
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput, PromptType
 
  BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
  doi = {10.48550/ARXIV.2201.12086},
@@ -136,6 +141,7 @@ blip_image_captioning_large = ModelMeta(
  release_date="2023-12-07",
  modalities=["image", "text"],
  n_parameters=470_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=1792,
  max_tokens=512,
  embed_dim=768,
@@ -164,6 +170,7 @@ blip_image_captioning_base = ModelMeta(
  release_date="2023-08-01",
  modalities=["image", "text"],
  n_parameters=247_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=942,
  max_tokens=512,
  embed_dim=768,
@@ -193,6 +200,7 @@ blip_vqa_base = ModelMeta(
  release_date="2023-12-07",
  modalities=["image", "text"],
  n_parameters=247_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=1467,
  max_tokens=512,
  embed_dim=768,
@@ -220,6 +228,7 @@ blip_vqa_capfilt_large = ModelMeta(
  release_date="2023-01-22",
  modalities=["image", "text"],
  n_parameters=247_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=942,
  max_tokens=512,
  embed_dim=768,
@@ -247,6 +256,7 @@ blip_itm_base_coco = ModelMeta(
  release_date="2023-08-01",
  modalities=["image", "text"],
  n_parameters=247_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=942,
  max_tokens=512,
  embed_dim=768,
@@ -274,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
  release_date="2023-08-01",
  modalities=["image", "text"],
  n_parameters=470_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=1793,
  max_tokens=512,
  embed_dim=768,
@@ -302,6 +313,7 @@ blip_itm_base_flickr = ModelMeta(
  release_date="2023-08-01",
  modalities=["image", "text"],
  n_parameters=247_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=942,
  max_tokens=512,
  embed_dim=768,
@@ -330,6 +342,7 @@ blip_itm_large_flickr = ModelMeta(
  release_date="2023-08-01",
  modalities=["image", "text"],
  n_parameters=470_000_000,
+ n_embedding_parameters=23_442_432,
  memory_usage_mb=1793,
  max_tokens=512,
  embed_dim=768,
@@ -1,18 +1,22 @@
+ from __future__ import annotations
+
  import logging
+ from typing import TYPE_CHECKING
 
  from mteb._create_dataloaders import _create_text_queries_dataloader
  from mteb._requires_package import requires_package
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.model_meta import ModelMeta
- from mteb.models.models_protocols import SearchProtocol
- from mteb.types import (
-     CorpusDatasetType,
-     EncodeKwargs,
-     InstructionDatasetType,
-     QueryDatasetType,
-     RetrievalOutputType,
-     TopRankedDocumentsType,
- )
+
+ if TYPE_CHECKING:
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.models.models_protocols import SearchProtocol
+     from mteb.types import (
+         CorpusDatasetType,
+         EncodeKwargs,
+         QueryDatasetType,
+         RetrievalOutputType,
+         TopRankedDocumentsType,
+     )
 
  logger = logging.getLogger(__name__)
 
@@ -75,7 +79,6 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
  hf_subset: str,
  top_k: int,
  encode_kwargs: EncodeKwargs,
- instructions: InstructionDatasetType | None = None,
  top_ranked: TopRankedDocumentsType | None = None,
  ) -> RetrievalOutputType:
  logger.info("Encoding Queries...")
@@ -98,13 +101,17 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
  query_results = queries_results[qi]
  scores = queries_scores[qi]
  doc_id_to_score = {}
+ query_documents = (
+     top_ranked[qid] if top_ranked and qid in top_ranked else None
+ )
 
  # Iterate over results
- for ri in range(len(query_results)):
-     doc_idx = query_results[ri]
-     score = scores[ri]
+ for doc_idx, score in zip(query_results, scores):
      doc_id = self.corpus_idx_to_id[doc_idx]
 
+     # handle reranking with a filtered set of documents
+     if query_documents is not None and doc_id not in query_documents:
+         continue
      doc_id_to_score[doc_id] = float(score)
 
  results[qid] = doc_id_to_score
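The `query_documents` lookup added above lets the BM25 search path act as a reranker: when a `top_ranked` candidate set is supplied for a query, only documents from that set keep their scores. A small self-contained sketch of the same filtering idea (hypothetical helper, not mteb's API):

# Hedged illustration of the reranking filter added above (not mteb's code):
# keep a query's BM25 scores only for documents in its candidate set, if any.
def filter_scores(
    scores: dict[str, float],
    candidates: set[str] | None,
    top_k: int,
) -> dict[str, float]:
    if candidates is not None:
        scores = {doc_id: s for doc_id, s in scores.items() if doc_id in candidates}
    # keep the top_k highest-scoring documents
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k])


print(filter_scores({"d1": 2.0, "d2": 5.0, "d3": 1.0}, {"d1", "d3"}, top_k=2))
# {'d1': 2.0, 'd3': 1.0}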
@@ -127,6 +134,7 @@ bm25_s = ModelMeta(
  revision="0_1_10",
  release_date="2024-07-10", # release of version 0.1.10
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  embed_dim=None,
  license=None,
@@ -1,5 +1,6 @@
- from collections.abc import Callable
- from typing import Any
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
 
  import torch
  from sentence_transformers import SentenceTransformer
@@ -9,6 +10,9 @@ from mteb.models import ModelMeta
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
  from mteb.types import PromptType
 
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
 
  def instruction_template(
  instruction: str, prompt_type: PromptType | None = None
@@ -99,6 +103,7 @@ BMRetriever_410M = ModelMeta(
  release_date="2024-04-29",
  embed_dim=1024,
  n_parameters=353_822_720,
+ n_embedding_parameters=51_511_296,
  memory_usage_mb=1349,
  max_tokens=2048,
  license="mit",
@@ -129,6 +134,7 @@ BMRetriever_1B = ModelMeta(
  release_date="2024-04-29",
  embed_dim=2048,
  n_parameters=908_759_040,
+ n_embedding_parameters=103_022_592,
  memory_usage_mb=3466,
  max_tokens=2048,
  license="mit",
@@ -159,6 +165,7 @@ BMRetriever_2B = ModelMeta(
  release_date="2024-04-29",
  embed_dim=2048,
  n_parameters=2_506_172_416,
+ n_embedding_parameters=524_288_000,
  memory_usage_mb=9560,
  max_tokens=8192,
  license="mit",
@@ -189,6 +196,7 @@ BMRetriever_7B = ModelMeta(
  release_date="2024-04-29",
  embed_dim=4096,
  n_parameters=7_110_660_096,
+ n_embedding_parameters=131_072_000,
  memory_usage_mb=27124,
  max_tokens=32768,
  license="mit",
@@ -41,6 +41,7 @@ cadet_embed = ModelMeta(
  open_weights=True,
  release_date="2025-05-11",
  n_parameters=109_000_000,
+ n_embedding_parameters=23_440_896,
  memory_usage_mb=418,
  embed_dim=768,
  license="apache-2.0",
@@ -1,27 +1,31 @@
+ from __future__ import annotations
+
  import logging
- from collections.abc import Sequence
  from typing import TYPE_CHECKING, Any
 
  import numpy as np
  import torch
- from torch.utils.data import DataLoader
 
  import mteb
  from mteb._create_dataloaders import _corpus_to_dict
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.models.models_protocols import PromptType
  from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
- from mteb.types import Array, BatchedInput
+ from mteb.types import PromptType
 
  from .bge_models import bge_full_data
 
  if TYPE_CHECKING:
+     from collections.abc import Sequence
+
+     from torch.utils.data import DataLoader
+
      from mteb.abstasks import (
          AbsTaskClassification,
          AbsTaskRetrieval,
          AbsTaskSummarization,
      )
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput
  logger = logging.getLogger(__name__)
 
  CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
@@ -222,6 +226,7 @@ cde_small_v1 = ModelMeta(
  revision="e151df18af0d7f1d1c37b074fee58406ececf19f",
  release_date="2024-09-24",
  n_parameters=int(281 * 1e6),
+ n_embedding_parameters=None,
  memory_usage_mb=1072, # Though the second-stage model is only 140M
  max_tokens=512,
  embed_dim=768,
@@ -251,6 +256,7 @@ cde_small_v2 = ModelMeta(
  revision="4e1d021a6c3fd7ce8aa0a7204057eee5ae61d390",
  release_date="2025-01-13",
  n_parameters=int(306 * 1e6),
+ n_embedding_parameters=None,
  memory_usage_mb=1166, # Though the second-stage model is only 140M
  max_tokens=512,
  embed_dim=768,
@@ -1,13 +1,18 @@
- from typing import Any
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
 
  import torch
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput, PromptType
 
 
  class CLIPModel(AbsEncoder):
@@ -123,6 +128,7 @@ clip_vit_large_patch14 = ModelMeta(
  release_date="2021-02-26",
  modalities=["image", "text"],
  n_parameters=428_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=1631,
  max_tokens=77,
  embed_dim=768,
@@ -147,6 +153,7 @@ clip_vit_base_patch32 = ModelMeta(
  release_date="2021-02-26",
  modalities=["image", "text"],
  n_parameters=151_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=576,
  max_tokens=77,
  embed_dim=512,
@@ -171,6 +178,7 @@ clip_vit_base_patch16 = ModelMeta(
  release_date="2021-02-26",
  modalities=["image", "text"],
  n_parameters=151_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=576,
  max_tokens=77,
  embed_dim=512,
@@ -30,6 +30,7 @@ e5_nl_small = ModelMeta(
  revision="0243664a6c5e12eef854b091eb283e51833c3e9f",
  release_date="2025-09-23",
  n_parameters=40_800_000,
+ n_embedding_parameters=19_200_768,
  memory_usage_mb=78,
  embed_dim=384,
  license="mit",
@@ -57,6 +58,7 @@ e5_nl_base = ModelMeta(
  revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956",
  release_date="2025-09-23",
  n_parameters=124_400_000,
+ n_embedding_parameters=38_401_536,
  memory_usage_mb=237,
  embed_dim=768,
  license="mit",
@@ -84,6 +86,7 @@ e5_nl_large = ModelMeta(
  revision="683333f86ed9eb3699b5567f0fdabeb958d412b0",
  release_date="2025-09-23",
  n_parameters=355_000_000,
+ n_embedding_parameters=51_202_048,
  memory_usage_mb=1355,
  embed_dim=1024,
  license="mit",
@@ -236,6 +236,7 @@ F2LLM_0B6 = ModelMeta(
  revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7",
  release_date="2025-09-18",
  n_parameters=595_776_512,
+ n_embedding_parameters=None,
  memory_usage_mb=1137,
  embed_dim=1024,
  license="apache-2.0",
@@ -266,6 +267,7 @@ F2LLM_1B7 = ModelMeta(
  revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c",
  release_date="2025-09-18",
  n_parameters=1_720_574_976,
+ n_embedding_parameters=None,
  memory_usage_mb=3282,
  embed_dim=2560,
  license="apache-2.0",
@@ -296,6 +298,7 @@ F2LLM_4B = ModelMeta(
  revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25",
  release_date="2025-09-18",
  n_parameters=4_021_774_336,
+ n_embedding_parameters=None,
  memory_usage_mb=7672,
  embed_dim=2560,
  license="apache-2.0",
@@ -318,6 +321,7 @@ C2LLM_0B5 = ModelMeta(
  release_date="2025-12-22",
  languages=c2llm_languages,
  n_parameters=497252096,
+ n_embedding_parameters=None,
  memory_usage_mb=948.0,
  max_tokens=32768,
  embed_dim=896,
@@ -346,6 +350,7 @@ C2LLM_7B = ModelMeta(
  release_date="2025-12-22",
  languages=c2llm_languages,
  n_parameters=7667028992,
+ n_embedding_parameters=None,
  memory_usage_mb=14624.0,
  max_tokens=32768,
  embed_dim=3584,
@@ -28,6 +28,7 @@ codesage_large = ModelMeta(
  release_date="2024-02-03",
  modalities=["text"],
  n_parameters=1_300_000_000,
+ n_embedding_parameters=100_667_392,
  memory_usage_mb=4959,
  max_tokens=2048,
  embed_dim=2048,
@@ -55,6 +56,7 @@ codesage_base = ModelMeta(
  release_date="2024-02-03",
  modalities=["text"],
  n_parameters=356_000_000,
+ n_embedding_parameters=50_333_696,
  memory_usage_mb=1358,
  max_tokens=2048,
  embed_dim=1024,
@@ -82,6 +84,7 @@ codesage_small = ModelMeta(
  release_date="2024-02-03",
  modalities=["text"],
  n_parameters=130_000_000,
+ n_embedding_parameters=50_333_696,
  memory_usage_mb=496,
  max_tokens=2048,
  embed_dim=1024,
@@ -1,18 +1,24 @@
+ from __future__ import annotations
+
  import logging
  import time
  from functools import wraps
- from typing import Any, Literal, get_args
+ from typing import TYPE_CHECKING, Any, Literal, get_args
 
  import numpy as np
  import torch
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
  from mteb._requires_package import requires_package
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
+ from mteb.types import PromptType
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput
 
  logger = logging.getLogger(__name__)
 
@@ -386,6 +392,7 @@ cohere_mult_3 = ModelMeta(
  revision="1",
  release_date="2023-11-02",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=None,
  embed_dim=512,
@@ -412,6 +419,7 @@ cohere_eng_3 = ModelMeta(
  revision="1",
  release_date="2023-11-02",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=512,
  embed_dim=1024,
@@ -437,6 +445,7 @@ cohere_mult_light_3 = ModelMeta(
  reference="https://cohere.com/blog/introducing-embed-v3",
  release_date="2023-11-02",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=512,
  embed_dim=384,
@@ -462,6 +471,7 @@ cohere_eng_light_3 = ModelMeta(
  revision="1",
  release_date="2023-11-02",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=512,
  embed_dim=384,
@@ -1,15 +1,15 @@
+ from __future__ import annotations
+
  import base64
  import io
  import os
  import time
- from typing import Any, Literal, get_args
+ from typing import TYPE_CHECKING, Any, Literal, get_args
 
  import torch
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
  from mteb._requires_package import requires_image_dependencies, requires_package
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models import ModelMeta
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_implementations.cohere_models import (
@@ -18,7 +18,12 @@ from mteb.models.model_implementations.cohere_models import (
  retry_with_rate_limit,
  )
  from mteb.models.model_meta import ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
+
+ if TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput, PromptType
 
 
  def _post_process_embeddings(
@@ -386,6 +391,7 @@ cohere_mult_3 = ModelMeta(
  revision="1",
  release_date="2024-10-24",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=None,
  embed_dim=1024,
@@ -410,6 +416,7 @@ cohere_eng_3 = ModelMeta(
  revision="1",
  release_date="2024-10-24",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=None,
  embed_dim=1024,
@@ -434,6 +441,7 @@ cohere_embed_v4_multimodal = ModelMeta(
  revision="1",
  release_date="2024-12-01",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=128000,
  embed_dim=1536,
@@ -458,6 +466,7 @@ cohere_embed_v4_multimodal_binary = ModelMeta(
  revision="1",
  release_date="2024-12-01",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=128000,
  embed_dim=1536,
@@ -483,6 +492,7 @@ cohere_embed_v4_multimodal_int8 = ModelMeta(
  revision="1",
  release_date="2024-12-01",
  n_parameters=None,
+ n_embedding_parameters=None,
  memory_usage_mb=None,
  max_tokens=128000,
  embed_dim=1536,
@@ -4,20 +4,21 @@ import logging
  from typing import TYPE_CHECKING, Any
 
  import torch
- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
 
  from mteb._requires_package import (
      requires_image_dependencies,
      requires_package,
  )
- from mteb.abstasks.task_metadata import TaskMetadata
  from mteb.models.abs_encoder import AbsEncoder
  from mteb.models.model_meta import ModelMeta, ScoringFunction
- from mteb.types import Array, BatchedInput, PromptType
 
  if TYPE_CHECKING:
      from PIL import Image
+     from torch.utils.data import DataLoader
+
+     from mteb.abstasks.task_metadata import TaskMetadata
+     from mteb.types import Array, BatchedInput, PromptType
 
  logger = logging.getLogger(__name__)
 
@@ -219,6 +220,7 @@ colpali_v1_1 = ModelMeta(
  release_date="2024-08-21",
  modalities=["image", "text"],
  n_parameters=2_920_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=4700,
  max_tokens=16384,
  embed_dim=128,
@@ -246,6 +248,7 @@ colpali_v1_2 = ModelMeta(
  release_date="2024-08-26",
  modalities=["image", "text"],
  n_parameters=2_920_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=4700,
  max_tokens=16384,
  embed_dim=128,
@@ -273,6 +276,7 @@ colpali_v1_3 = ModelMeta(
  release_date="2024-11-01",
  modalities=["image", "text"],
  n_parameters=2_920_000_000,
+ n_embedding_parameters=None,
  memory_usage_mb=4700,
  max_tokens=16384,
  embed_dim=128,