mteb 2.7.3__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. mteb/abstasks/retrieval.py +1 -1
  2. mteb/benchmarks/benchmarks/__init__.py +2 -0
  3. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  4. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  5. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  6. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  7. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  8. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  9. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  10. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  11. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  12. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  13. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  14. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  15. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  16. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  17. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  18. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  19. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  20. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  21. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  22. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  23. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  24. mteb/models/model_implementations/align_models.py +1 -0
  25. mteb/models/model_implementations/amazon_models.py +1 -0
  26. mteb/models/model_implementations/andersborges.py +2 -0
  27. mteb/models/model_implementations/ara_models.py +1 -0
  28. mteb/models/model_implementations/arctic_models.py +8 -0
  29. mteb/models/model_implementations/b1ade_models.py +1 -0
  30. mteb/models/model_implementations/bedrock_models.py +4 -0
  31. mteb/models/model_implementations/bge_models.py +40 -1
  32. mteb/models/model_implementations/bica_model.py +1 -0
  33. mteb/models/model_implementations/blip2_models.py +2 -0
  34. mteb/models/model_implementations/blip_models.py +8 -0
  35. mteb/models/model_implementations/bm25.py +8 -5
  36. mteb/models/model_implementations/bmretriever_models.py +4 -0
  37. mteb/models/model_implementations/cadet_models.py +1 -0
  38. mteb/models/model_implementations/cde_models.py +2 -0
  39. mteb/models/model_implementations/clip_models.py +3 -0
  40. mteb/models/model_implementations/clips_models.py +3 -0
  41. mteb/models/model_implementations/codefuse_models.py +5 -0
  42. mteb/models/model_implementations/codesage_models.py +3 -0
  43. mteb/models/model_implementations/cohere_models.py +4 -0
  44. mteb/models/model_implementations/cohere_v.py +5 -0
  45. mteb/models/model_implementations/colpali_models.py +3 -0
  46. mteb/models/model_implementations/colqwen_models.py +7 -0
  47. mteb/models/model_implementations/colsmol_models.py +2 -0
  48. mteb/models/model_implementations/conan_models.py +1 -0
  49. mteb/models/model_implementations/dino_models.py +19 -0
  50. mteb/models/model_implementations/e5_instruct.py +4 -0
  51. mteb/models/model_implementations/e5_models.py +9 -0
  52. mteb/models/model_implementations/e5_v.py +1 -0
  53. mteb/models/model_implementations/eagerworks_models.py +1 -0
  54. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  55. mteb/models/model_implementations/en_code_retriever.py +1 -0
  56. mteb/models/model_implementations/euler_models.py +1 -0
  57. mteb/models/model_implementations/evaclip_models.py +4 -0
  58. mteb/models/model_implementations/fa_models.py +9 -0
  59. mteb/models/model_implementations/facebookai.py +2 -0
  60. mteb/models/model_implementations/geogpt_models.py +1 -0
  61. mteb/models/model_implementations/gme_v_models.py +2 -0
  62. mteb/models/model_implementations/google_models.py +5 -0
  63. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
  64. mteb/models/model_implementations/gritlm_models.py +2 -0
  65. mteb/models/model_implementations/gte_models.py +9 -0
  66. mteb/models/model_implementations/hinvec_models.py +1 -0
  67. mteb/models/model_implementations/human.py +1 -0
  68. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  69. mteb/models/model_implementations/inf_models.py +2 -0
  70. mteb/models/model_implementations/jasper_models.py +2 -0
  71. mteb/models/model_implementations/jina_clip.py +1 -0
  72. mteb/models/model_implementations/jina_models.py +7 -0
  73. mteb/models/model_implementations/kalm_models.py +6 -0
  74. mteb/models/model_implementations/kblab.py +1 -0
  75. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  76. mteb/models/model_implementations/kfst.py +1 -0
  77. mteb/models/model_implementations/kowshik24_models.py +1 -0
  78. mteb/models/model_implementations/lens_models.py +2 -0
  79. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  80. mteb/models/model_implementations/linq_models.py +1 -0
  81. mteb/models/model_implementations/listconranker.py +1 -0
  82. mteb/models/model_implementations/llm2clip_models.py +3 -0
  83. mteb/models/model_implementations/llm2vec_models.py +8 -0
  84. mteb/models/model_implementations/mcinext_models.py +3 -0
  85. mteb/models/model_implementations/mdbr_models.py +2 -0
  86. mteb/models/model_implementations/misc_models.py +63 -0
  87. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  88. mteb/models/model_implementations/mme5_models.py +2 -1
  89. mteb/models/model_implementations/moco_models.py +2 -0
  90. mteb/models/model_implementations/mod_models.py +1 -0
  91. mteb/models/model_implementations/model2vec_models.py +13 -0
  92. mteb/models/model_implementations/moka_models.py +3 -0
  93. mteb/models/model_implementations/nbailab.py +3 -0
  94. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  95. mteb/models/model_implementations/nomic_models.py +6 -0
  96. mteb/models/model_implementations/nomic_models_vision.py +1 -0
  97. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
  98. mteb/models/model_implementations/nvidia_models.py +3 -0
  99. mteb/models/model_implementations/octen_models.py +2 -0
  100. mteb/models/model_implementations/openai_models.py +5 -0
  101. mteb/models/model_implementations/openclip_models.py +8 -0
  102. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  103. mteb/models/model_implementations/ops_moa_models.py +2 -0
  104. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  105. mteb/models/model_implementations/pawan_models.py +1 -0
  106. mteb/models/model_implementations/piccolo_models.py +2 -0
  107. mteb/models/model_implementations/promptriever_models.py +4 -0
  108. mteb/models/model_implementations/pylate_models.py +3 -0
  109. mteb/models/model_implementations/qodo_models.py +2 -0
  110. mteb/models/model_implementations/qtack_models.py +1 -0
  111. mteb/models/model_implementations/qwen3_models.py +3 -0
  112. mteb/models/model_implementations/qzhou_models.py +2 -0
  113. mteb/models/model_implementations/rasgaard_models.py +1 -0
  114. mteb/models/model_implementations/reasonir_model.py +65 -0
  115. mteb/models/model_implementations/repllama_models.py +2 -0
  116. mteb/models/model_implementations/rerankers_custom.py +3 -0
  117. mteb/models/model_implementations/rerankers_monot5_based.py +14 -0
  118. mteb/models/model_implementations/richinfoai_models.py +1 -0
  119. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  120. mteb/models/model_implementations/ruri_models.py +10 -0
  121. mteb/models/model_implementations/salesforce_models.py +3 -0
  122. mteb/models/model_implementations/samilpwc_models.py +1 -0
  123. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  124. mteb/models/model_implementations/searchmap_models.py +1 -0
  125. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
  126. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +1 -0
  127. mteb/models/model_implementations/seed_models.py +1 -0
  128. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  129. mteb/models/model_implementations/shuu_model.py +1 -0
  130. mteb/models/model_implementations/siglip_models.py +10 -0
  131. mteb/models/model_implementations/sonar_models.py +2 -1
  132. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  133. mteb/models/model_implementations/stella_models.py +6 -0
  134. mteb/models/model_implementations/tarka_models.py +2 -0
  135. mteb/models/model_implementations/text2vec_models.py +3 -0
  136. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  137. mteb/models/model_implementations/uae_models.py +1 -0
  138. mteb/models/model_implementations/vdr_models.py +1 -0
  139. mteb/models/model_implementations/vi_vn_models.py +6 -0
  140. mteb/models/model_implementations/vista_models.py +2 -0
  141. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  142. mteb/models/model_implementations/voyage_models.py +15 -0
  143. mteb/models/model_implementations/voyage_v.py +1 -0
  144. mteb/models/model_implementations/xyz_models.py +1 -0
  145. mteb/models/model_implementations/youtu_models.py +1 -0
  146. mteb/models/model_implementations/yuan_models.py +1 -0
  147. mteb/models/model_implementations/yuan_models_en.py +1 -0
  148. mteb/models/model_meta.py +35 -2
  149. mteb/tasks/retrieval/eng/__init__.py +42 -0
  150. mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
  151. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  152. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
  153. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/RECORD +157 -136
  154. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
  155. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
  156. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
  157. {mteb-2.7.3.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,35 @@
1
+ {
2
+ "long": {
3
+ "num_samples": 662,
4
+ "number_of_characters": 21154322,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 21080575,
7
+ "min_text_length": 30,
8
+ "average_text_length": 38051.579422382674,
9
+ "max_text_length": 5732344,
10
+ "unique_texts": 551
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 73747,
15
+ "min_text_length": 158,
16
+ "average_text_length": 682.8425925925926,
17
+ "max_text_length": 2843,
18
+ "unique_texts": 108
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 129,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.1944444444444444,
25
+ "max_relevant_docs_per_query": 5,
26
+ "unique_relevant_docs": 129
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 59832,
30
+ "min_top_ranked_per_query": 554,
31
+ "average_top_ranked_per_query": 554.0,
32
+ "max_top_ranked_per_query": 554
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "standard": {
3
+ "num_samples": 60900,
4
+ "number_of_characters": 20971763,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 20898016,
7
+ "min_text_length": 1,
8
+ "average_text_length": 343.7626003421503,
9
+ "max_text_length": 158296,
10
+ "unique_texts": 50142
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 73747,
15
+ "min_text_length": 158,
16
+ "average_text_length": 682.8425925925926,
17
+ "max_text_length": 2843,
18
+ "unique_texts": 108
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 604,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 5.592592592592593,
25
+ "max_relevant_docs_per_query": 59,
26
+ "unique_relevant_docs": 604
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 6565536,
30
+ "min_top_ranked_per_query": 60792,
31
+ "average_top_ranked_per_query": 60792.0,
32
+ "max_top_ranked_per_query": 60792
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "standard": {
3
+ "num_samples": 188207,
4
+ "number_of_characters": 141817604,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 141734227,
7
+ "min_text_length": 58,
8
+ "average_text_length": 753.8974425803981,
9
+ "max_text_length": 7334,
10
+ "unique_texts": 176508
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 83377,
15
+ "min_text_length": 12,
16
+ "average_text_length": 406.7170731707317,
17
+ "max_text_length": 1255,
18
+ "unique_texts": 201
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 469,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 2.299019607843137,
25
+ "max_relevant_docs_per_query": 7,
26
+ "unique_relevant_docs": 234
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 37946536,
30
+ "min_top_ranked_per_query": 176970,
31
+ "average_top_ranked_per_query": 185105.05365853658,
32
+ "max_top_ranked_per_query": 188176
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "standard": {
3
+ "num_samples": 23904,
4
+ "number_of_characters": 20825122,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 20797224,
7
+ "min_text_length": 74,
8
+ "average_text_length": 872.4033726246906,
9
+ "max_text_length": 19104,
10
+ "unique_texts": 23839
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 27898,
15
+ "min_text_length": 13,
16
+ "average_text_length": 429.2,
17
+ "max_text_length": 1255,
18
+ "unique_texts": 65
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 126,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.9384615384615385,
25
+ "max_relevant_docs_per_query": 6,
26
+ "unique_relevant_docs": 95
27
+ },
28
+ "top_ranked_statistics": {
29
+ "num_top_ranked": 1549535,
30
+ "min_top_ranked_per_query": 23839,
31
+ "average_top_ranked_per_query": 23839.0,
32
+ "max_top_ranked_per_query": 23839
33
+ }
34
+ }
35
+ }
@@ -116,6 +116,7 @@ align_base = ModelMeta(
116
116
  release_date="2023-02-24",
117
117
  modalities=["image", "text"],
118
118
  n_parameters=176_000_000,
119
+ n_embedding_parameters=None,
119
120
  memory_usage_mb=671,
120
121
  max_tokens=64,
121
122
  embed_dim=768,
@@ -8,6 +8,7 @@ amazon_titan_text_embeddings_v2 = ModelMeta(
8
8
  release_date="2024-04-30",
9
9
  languages=["eng-Latn"],
10
10
  n_parameters=None,
11
+ n_embedding_parameters=None,
11
12
  memory_usage_mb=None,
12
13
  max_tokens=None,
13
14
  embed_dim=None,
@@ -12,6 +12,7 @@ model2vecdk = ModelMeta(
12
12
  revision="cb576c78dcc1b729e4612645f61db59929d69e61",
13
13
  release_date="2025-11-21",
14
14
  n_parameters=48042496,
15
+ n_embedding_parameters=None,
15
16
  memory_usage_mb=183,
16
17
  max_tokens=np.inf,
17
18
  embed_dim=256,
@@ -43,6 +44,7 @@ model2vecdk_stem = ModelMeta(
43
44
  revision="cb576c78dcc1b729e4612645f61db59929d69e61",
44
45
  release_date="2025-11-21",
45
46
  n_parameters=48578560,
47
+ n_embedding_parameters=None,
46
48
  memory_usage_mb=185,
47
49
  max_tokens=np.inf,
48
50
  embed_dim=256,
@@ -10,6 +10,7 @@ arabic_triplet_matryoshka = ModelMeta(
10
10
  revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200",
11
11
  release_date="2024-07-28",
12
12
  n_parameters=135_000_000,
13
+ n_embedding_parameters=49_152_000,
13
14
  memory_usage_mb=516,
14
15
  embed_dim=768,
15
16
  license="apache-2.0",
@@ -147,6 +147,7 @@ arctic_embed_xs = ModelMeta(
147
147
  open_weights=True,
148
148
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
149
149
  n_parameters=22_600_000,
150
+ n_embedding_parameters=11_720_448,
150
151
  memory_usage_mb=86,
151
152
  max_tokens=512,
152
153
  embed_dim=384,
@@ -173,6 +174,7 @@ arctic_embed_s = ModelMeta(
173
174
  open_weights=True,
174
175
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
175
176
  n_parameters=32_200_000,
177
+ n_embedding_parameters=11_720_448,
176
178
  memory_usage_mb=127,
177
179
  max_tokens=512,
178
180
  embed_dim=384,
@@ -199,6 +201,7 @@ arctic_embed_m = ModelMeta(
199
201
  open_weights=True,
200
202
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
201
203
  n_parameters=109_000_000,
204
+ n_embedding_parameters=23_440_896,
202
205
  memory_usage_mb=415,
203
206
  max_tokens=512,
204
207
  embed_dim=768,
@@ -225,6 +228,7 @@ arctic_embed_m_long = ModelMeta(
225
228
  open_weights=True,
226
229
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
227
230
  n_parameters=137_000_000,
231
+ n_embedding_parameters=None,
228
232
  memory_usage_mb=522,
229
233
  max_tokens=2048,
230
234
  embed_dim=768,
@@ -250,6 +254,7 @@ arctic_embed_l = ModelMeta(
250
254
  open_weights=True,
251
255
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
252
256
  n_parameters=335_000_000,
257
+ n_embedding_parameters=31_254_528,
253
258
  memory_usage_mb=1274,
254
259
  max_tokens=512,
255
260
  embed_dim=1024,
@@ -280,6 +285,7 @@ arctic_embed_m_v1_5 = ModelMeta(
280
285
  open_weights=True,
281
286
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors", "GGUF"],
282
287
  n_parameters=109_000_000,
288
+ n_embedding_parameters=23_440_896,
283
289
  memory_usage_mb=415,
284
290
  max_tokens=512,
285
291
  embed_dim=768,
@@ -306,6 +312,7 @@ arctic_embed_m_v2_0 = ModelMeta(
306
312
  open_weights=True,
307
313
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
308
314
  n_parameters=305_000_000,
315
+ n_embedding_parameters=None,
309
316
  memory_usage_mb=1165,
310
317
  max_tokens=8192,
311
318
  embed_dim=768,
@@ -331,6 +338,7 @@ arctic_embed_l_v2_0 = ModelMeta(
331
338
  open_weights=True,
332
339
  framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
333
340
  n_parameters=568_000_000,
341
+ n_embedding_parameters=256_002_048,
334
342
  memory_usage_mb=2166,
335
343
  max_tokens=8192,
336
344
  embed_dim=1024,
@@ -16,6 +16,7 @@ b1ade_embed = ModelMeta(
16
16
  open_weights=True,
17
17
  release_date="2025-03-10",
18
18
  n_parameters=335_000_000,
19
+ n_embedding_parameters=31_254_528,
19
20
  memory_usage_mb=1278,
20
21
  embed_dim=1024,
21
22
  license="mit",
@@ -179,6 +179,7 @@ amazon_titan_embed_text_v1 = ModelMeta(
179
179
  embed_dim=1536,
180
180
  open_weights=False,
181
181
  n_parameters=None,
182
+ n_embedding_parameters=None,
182
183
  memory_usage_mb=None,
183
184
  public_training_code=None,
184
185
  public_training_data=None, # assumed
@@ -206,6 +207,7 @@ amazon_titan_embed_text_v2 = ModelMeta(
206
207
  embed_dim=1024,
207
208
  open_weights=False,
208
209
  n_parameters=None,
210
+ n_embedding_parameters=None,
209
211
  memory_usage_mb=None,
210
212
  public_training_code=None,
211
213
  public_training_data=None, # assumed
@@ -235,6 +237,7 @@ cohere_embed_english_v3 = ModelMeta(
235
237
  revision="1",
236
238
  release_date="2023-11-02",
237
239
  n_parameters=None,
240
+ n_embedding_parameters=None,
238
241
  memory_usage_mb=None,
239
242
  public_training_code=None,
240
243
  public_training_data=None, # assumed
@@ -263,6 +266,7 @@ cohere_embed_multilingual_v3 = ModelMeta(
263
266
  revision="1",
264
267
  release_date="2023-11-02",
265
268
  n_parameters=None,
269
+ n_embedding_parameters=None,
266
270
  memory_usage_mb=None,
267
271
  public_training_code=None,
268
272
  public_training_data=None, # assumed
@@ -6,7 +6,29 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
6
6
 
7
7
  from .e5_instruct import E5_MISTRAL_TRAINING_DATA
8
8
 
9
- model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
9
+ model_prompts = {
10
+ "query": "Represent this sentence for searching relevant passages: ",
11
+ "BrightBiologyRetrieval-query": "Represent this biology post for searching relevant passages: ",
12
+ "BrightEarthScienceRetrieval-query": "Represent this earth_science post for searching relevant passages: ",
13
+ "BrightEconomicsRetrieval-query": "Represent this economics post for searching relevant passages: ",
14
+ "BrightPsychologyRetrieval-query": "Represent this psychology post for searching relevant passages: ",
15
+ "BrightRoboticsRetrieval-query": "Represent this robotics post for searching relevant passages: ",
16
+ "BrightStackoverflowRetrieval-query": "Represent this stackoverflow post for searching relevant passages: ",
17
+ "BrightSustainableLivingRetrieval-query": "Represent this sustainable_living post for searching relevant passages: ",
18
+ "BrightPonyRetrieval-query": "Represent this Pony question for searching relevant passages: ",
19
+ "BrightLeetcodeRetrieval-query": "Represent this Coding problem for searching relevant examples: ",
20
+ "BrightAopsRetrieval-query": "Represent this Math problem for searching relevant examples: ",
21
+ "BrightTheoremQATheoremsRetrieval-query": "Represent this Math problem for searching relevant theorems: ",
22
+ "BrightTheoremQAQuestionsRetrieval-query": "Represent this Math problem for searching relevant examples: ",
23
+ "BrightBiologyLongRetrieval-query": "Represent this biology post for searching relevant documents: ",
24
+ "BrightEarthScienceLongRetrieval-query": "Represent this earth_science post for searching relevant documents: ",
25
+ "BrightEconomicsLongRetrieval-query": "Represent this economics post for searching relevant documents: ",
26
+ "BrightPsychologyLongRetrieval-query": "Represent this psychology post for searching relevant documents: ",
27
+ "BrightRoboticsLongRetrieval-query": "Represent this robotics post for searching relevant document: ",
28
+ "BrightStackoverflowLongRetrieval-query": "Represent this stackoverflow post for searching relevant document: ",
29
+ "BrightSustainableLivingLongRetrieval-query": "Represent this sustainable_living post for searching relevant documents: ",
30
+ "BrightPonyLongRetrieval-query": "Represent this Pony question for searching relevant documents: ",
31
+ }
10
32
  BGE_15_CITATION = """@misc{bge_embedding,
11
33
  title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
12
34
  author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
@@ -325,6 +347,7 @@ bge_small_en_v1_5 = ModelMeta(
325
347
  revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
326
348
  release_date="2023-09-12", # initial commit of hf model.
327
349
  n_parameters=33_400_000,
350
+ n_embedding_parameters=11_720_448,
328
351
  memory_usage_mb=127,
329
352
  embed_dim=512,
330
353
  license="mit",
@@ -357,6 +380,7 @@ bge_base_en_v1_5 = ModelMeta(
357
380
  revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
358
381
  release_date="2023-09-11", # initial commit of hf model.
359
382
  n_parameters=109_000_000,
383
+ n_embedding_parameters=23_440_896,
360
384
  memory_usage_mb=390,
361
385
  embed_dim=768,
362
386
  license="mit",
@@ -389,6 +413,7 @@ bge_large_en_v1_5 = ModelMeta(
389
413
  revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
390
414
  release_date="2023-09-12", # initial commit of hf model.
391
415
  n_parameters=335_000_000,
416
+ n_embedding_parameters=31_254_528,
392
417
  memory_usage_mb=1242,
393
418
  embed_dim=1024,
394
419
  license="mit",
@@ -421,6 +446,7 @@ bge_small_zh = ModelMeta(
421
446
  revision="1d2363c5de6ce9ba9c890c8e23a4c72dce540ca8",
422
447
  release_date="2023-08-05", # initial commit of hf model.
423
448
  n_parameters=33_400_000,
449
+ n_embedding_parameters=10_817_536,
424
450
  memory_usage_mb=127,
425
451
  embed_dim=512,
426
452
  license="mit",
@@ -448,6 +474,7 @@ bge_base_zh = ModelMeta(
448
474
  revision="0e5f83d4895db7955e4cb9ed37ab73f7ded339b6",
449
475
  release_date="2023-08-05", # initial commit of hf model.
450
476
  n_parameters=109_000_000,
477
+ n_embedding_parameters=16_226_304,
451
478
  memory_usage_mb=390,
452
479
  embed_dim=768,
453
480
  license="mit",
@@ -475,6 +502,7 @@ bge_large_zh = ModelMeta(
475
502
  revision="b5d9f5c027e87b6f0b6fa4b614f8f9cdc45ce0e8",
476
503
  release_date="2023-08-02", # initial commit of hf model.
477
504
  n_parameters=335_000_000,
505
+ n_embedding_parameters=21_635_072,
478
506
  memory_usage_mb=1242,
479
507
  embed_dim=1024,
480
508
  license="mit",
@@ -502,6 +530,7 @@ bge_small_en = ModelMeta(
502
530
  revision="4778d71a06863076696b03fd2777eb118712cad8",
503
531
  release_date="2023-08-05", # initial commit of hf model.
504
532
  n_parameters=33_400_000,
533
+ n_embedding_parameters=11_720_448,
505
534
  memory_usage_mb=127,
506
535
  embed_dim=512,
507
536
  license="mit",
@@ -529,6 +558,7 @@ bge_base_en = ModelMeta(
529
558
  revision="b737bf5dcc6ee8bdc530531266b4804a5d77b5d8",
530
559
  release_date="2023-08-05", # initial commit of hf model.
531
560
  n_parameters=109_000_000,
561
+ n_embedding_parameters=23_440_896,
532
562
  memory_usage_mb=390,
533
563
  embed_dim=768,
534
564
  license="mit",
@@ -562,6 +592,7 @@ bge_large_en = ModelMeta(
562
592
  revision="abe7d9d814b775ca171121fb03f394dc42974275",
563
593
  release_date="2023-08-05", # initial commit of hf model.
564
594
  n_parameters=335_000_000,
595
+ n_embedding_parameters=31_254_528,
565
596
  memory_usage_mb=1242,
566
597
  embed_dim=1024,
567
598
  license="mit",
@@ -590,6 +621,7 @@ bge_small_zh_v1_5 = ModelMeta(
590
621
  revision="7999e1d3359715c523056ef9478215996d62a620",
591
622
  release_date="2023-09-12", # initial commit of hf model.
592
623
  n_parameters=33_400_000,
624
+ n_embedding_parameters=10_817_536,
593
625
  memory_usage_mb=91,
594
626
  embed_dim=512,
595
627
  license="mit",
@@ -616,6 +648,7 @@ bge_base_zh_v1_5 = ModelMeta(
616
648
  revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
617
649
  release_date="2023-09-11", # initial commit of hf model.
618
650
  n_parameters=109_000_000,
651
+ n_embedding_parameters=16_226_304,
619
652
  memory_usage_mb=416,
620
653
  embed_dim=768,
621
654
  license="mit",
@@ -642,6 +675,7 @@ bge_large_zh_v1_5 = ModelMeta(
642
675
  revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
643
676
  release_date="2023-09-12", # initial commit of hf model.
644
677
  n_parameters=335_000_000,
678
+ n_embedding_parameters=21_635_072,
645
679
  memory_usage_mb=1278,
646
680
  embed_dim=1024,
647
681
  license="mit",
@@ -665,6 +699,7 @@ bge_m3 = ModelMeta(
665
699
  revision="5617a9f61b028005a4858fdac845db406aefb181",
666
700
  release_date="2024-06-28",
667
701
  n_parameters=568_000_000,
702
+ n_embedding_parameters=256_002_048,
668
703
  memory_usage_mb=2167,
669
704
  embed_dim=1024,
670
705
  license="mit",
@@ -761,6 +796,7 @@ bge_multilingual_gemma2 = ModelMeta(
761
796
  revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
762
797
  release_date="2024-07-25", # initial commit of hf model.
763
798
  n_parameters=int(9.24 * 1e9),
799
+ n_embedding_parameters=917_511_168,
764
800
  memory_usage_mb=35254,
765
801
  embed_dim=3584, # from old C-MTEB leaderboard
766
802
  license="https://ai.google.dev/gemma/terms",
@@ -808,6 +844,7 @@ bge_en_icl = ModelMeta(
808
844
  revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
809
845
  release_date="2024-07-25", # initial commit of hf model.
810
846
  n_parameters=int(7.11 * 1e9),
847
+ n_embedding_parameters=131_084_288,
811
848
  memory_usage_mb=27125,
812
849
  embed_dim=4096,
813
850
  license="apache-2.0",
@@ -842,6 +879,7 @@ bge_m3_unsupervised = ModelMeta(
842
879
  revision="46f03bc86361cf88102b0b517b36c8259f2946b1",
843
880
  release_date="2024-01-30", # January 30, 2024 - BGE-M3 release date
844
881
  n_parameters=568_000_000,
882
+ n_embedding_parameters=256_002_048,
845
883
  memory_usage_mb=2167,
846
884
  embed_dim=1024,
847
885
  license="mit",
@@ -871,6 +909,7 @@ manu__bge_m3_custom_fr = ModelMeta(
871
909
  languages=None,
872
910
  loader=sentence_transformers_loader,
873
911
  n_parameters=567754752,
912
+ n_embedding_parameters=256_002_048,
874
913
  memory_usage_mb=2166,
875
914
  max_tokens=8194.0,
876
915
  embed_dim=1024,
@@ -9,6 +9,7 @@ bica_base = ModelMeta(
9
9
  revision="31237a836e5ae908c308a256573e5f0986498574",
10
10
  release_date="2025-11-14",
11
11
  n_parameters=110_000_000,
12
+ n_embedding_parameters=23_440_896,
12
13
  memory_usage_mb=418,
13
14
  embed_dim=768,
14
15
  license="mit",
@@ -177,6 +177,7 @@ blip2_opt_2_7b = ModelMeta(
177
177
  release_date="2024-03-22",
178
178
  modalities=["image", "text"],
179
179
  n_parameters=3_740_000_000,
180
+ n_embedding_parameters=None,
180
181
  memory_usage_mb=14285,
181
182
  max_tokens=None,
182
183
  embed_dim=768,
@@ -201,6 +202,7 @@ blip2_opt_6_7b_coco = ModelMeta(
201
202
  release_date="2024-03-31",
202
203
  modalities=["image", "text"],
203
204
  n_parameters=7_750_000_000,
205
+ n_embedding_parameters=None,
204
206
  memory_usage_mb=29577,
205
207
  max_tokens=None,
206
208
  embed_dim=768,
@@ -141,6 +141,7 @@ blip_image_captioning_large = ModelMeta(
141
141
  release_date="2023-12-07",
142
142
  modalities=["image", "text"],
143
143
  n_parameters=470_000_000,
144
+ n_embedding_parameters=23_442_432,
144
145
  memory_usage_mb=1792,
145
146
  max_tokens=512,
146
147
  embed_dim=768,
@@ -169,6 +170,7 @@ blip_image_captioning_base = ModelMeta(
169
170
  release_date="2023-08-01",
170
171
  modalities=["image", "text"],
171
172
  n_parameters=247_000_000,
173
+ n_embedding_parameters=23_442_432,
172
174
  memory_usage_mb=942,
173
175
  max_tokens=512,
174
176
  embed_dim=768,
@@ -198,6 +200,7 @@ blip_vqa_base = ModelMeta(
198
200
  release_date="2023-12-07",
199
201
  modalities=["image", "text"],
200
202
  n_parameters=247_000_000,
203
+ n_embedding_parameters=23_442_432,
201
204
  memory_usage_mb=1467,
202
205
  max_tokens=512,
203
206
  embed_dim=768,
@@ -225,6 +228,7 @@ blip_vqa_capfilt_large = ModelMeta(
225
228
  release_date="2023-01-22",
226
229
  modalities=["image", "text"],
227
230
  n_parameters=247_000_000,
231
+ n_embedding_parameters=23_442_432,
228
232
  memory_usage_mb=942,
229
233
  max_tokens=512,
230
234
  embed_dim=768,
@@ -252,6 +256,7 @@ blip_itm_base_coco = ModelMeta(
252
256
  release_date="2023-08-01",
253
257
  modalities=["image", "text"],
254
258
  n_parameters=247_000_000,
259
+ n_embedding_parameters=23_442_432,
255
260
  memory_usage_mb=942,
256
261
  max_tokens=512,
257
262
  embed_dim=768,
@@ -279,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
279
284
  release_date="2023-08-01",
280
285
  modalities=["image", "text"],
281
286
  n_parameters=470_000_000,
287
+ n_embedding_parameters=23_442_432,
282
288
  memory_usage_mb=1793,
283
289
  max_tokens=512,
284
290
  embed_dim=768,
@@ -307,6 +313,7 @@ blip_itm_base_flickr = ModelMeta(
307
313
  release_date="2023-08-01",
308
314
  modalities=["image", "text"],
309
315
  n_parameters=247_000_000,
316
+ n_embedding_parameters=23_442_432,
310
317
  memory_usage_mb=942,
311
318
  max_tokens=512,
312
319
  embed_dim=768,
@@ -335,6 +342,7 @@ blip_itm_large_flickr = ModelMeta(
335
342
  release_date="2023-08-01",
336
343
  modalities=["image", "text"],
337
344
  n_parameters=470_000_000,
345
+ n_embedding_parameters=23_442_432,
338
346
  memory_usage_mb=1793,
339
347
  max_tokens=512,
340
348
  embed_dim=768,
@@ -13,7 +13,6 @@ if TYPE_CHECKING:
13
13
  from mteb.types import (
14
14
  CorpusDatasetType,
15
15
  EncodeKwargs,
16
- InstructionDatasetType,
17
16
  QueryDatasetType,
18
17
  RetrievalOutputType,
19
18
  TopRankedDocumentsType,
@@ -80,7 +79,6 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
80
79
  hf_subset: str,
81
80
  top_k: int,
82
81
  encode_kwargs: EncodeKwargs,
83
- instructions: InstructionDatasetType | None = None,
84
82
  top_ranked: TopRankedDocumentsType | None = None,
85
83
  ) -> RetrievalOutputType:
86
84
  logger.info("Encoding Queries...")
@@ -103,13 +101,17 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
103
101
  query_results = queries_results[qi]
104
102
  scores = queries_scores[qi]
105
103
  doc_id_to_score = {}
104
+ query_documents = (
105
+ top_ranked[qid] if top_ranked and qid in top_ranked else None
106
+ )
106
107
 
107
108
  # Iterate over results
108
- for ri in range(len(query_results)):
109
- doc_idx = query_results[ri]
110
- score = scores[ri]
109
+ for doc_idx, score in zip(query_results, scores):
111
110
  doc_id = self.corpus_idx_to_id[doc_idx]
112
111
 
112
+ # handle reranking with a filtered set of documents
113
+ if query_documents is not None and doc_id not in query_documents:
114
+ continue
113
115
  doc_id_to_score[doc_id] = float(score)
114
116
 
115
117
  results[qid] = doc_id_to_score
@@ -132,6 +134,7 @@ bm25_s = ModelMeta(
132
134
  revision="0_1_10",
133
135
  release_date="2024-07-10", # release of version 0.1.10
134
136
  n_parameters=None,
137
+ n_embedding_parameters=None,
135
138
  memory_usage_mb=None,
136
139
  embed_dim=None,
137
140
  license=None,
@@ -103,6 +103,7 @@ BMRetriever_410M = ModelMeta(
103
103
  release_date="2024-04-29",
104
104
  embed_dim=1024,
105
105
  n_parameters=353_822_720,
106
+ n_embedding_parameters=51_511_296,
106
107
  memory_usage_mb=1349,
107
108
  max_tokens=2048,
108
109
  license="mit",
@@ -133,6 +134,7 @@ BMRetriever_1B = ModelMeta(
133
134
  release_date="2024-04-29",
134
135
  embed_dim=2048,
135
136
  n_parameters=908_759_040,
137
+ n_embedding_parameters=103_022_592,
136
138
  memory_usage_mb=3466,
137
139
  max_tokens=2048,
138
140
  license="mit",
@@ -163,6 +165,7 @@ BMRetriever_2B = ModelMeta(
163
165
  release_date="2024-04-29",
164
166
  embed_dim=2048,
165
167
  n_parameters=2_506_172_416,
168
+ n_embedding_parameters=524_288_000,
166
169
  memory_usage_mb=9560,
167
170
  max_tokens=8192,
168
171
  license="mit",
@@ -193,6 +196,7 @@ BMRetriever_7B = ModelMeta(
193
196
  release_date="2024-04-29",
194
197
  embed_dim=4096,
195
198
  n_parameters=7_110_660_096,
199
+ n_embedding_parameters=131_072_000,
196
200
  memory_usage_mb=27124,
197
201
  max_tokens=32768,
198
202
  license="mit",
@@ -41,6 +41,7 @@ cadet_embed = ModelMeta(
41
41
  open_weights=True,
42
42
  release_date="2025-05-11",
43
43
  n_parameters=109_000_000,
44
+ n_embedding_parameters=23_440_896,
44
45
  memory_usage_mb=418,
45
46
  embed_dim=768,
46
47
  license="apache-2.0",
@@ -226,6 +226,7 @@ cde_small_v1 = ModelMeta(
226
226
  revision="e151df18af0d7f1d1c37b074fee58406ececf19f",
227
227
  release_date="2024-09-24",
228
228
  n_parameters=int(281 * 1e6),
229
+ n_embedding_parameters=None,
229
230
  memory_usage_mb=1072, # Though the second-stage model is only 140M
230
231
  max_tokens=512,
231
232
  embed_dim=768,
@@ -255,6 +256,7 @@ cde_small_v2 = ModelMeta(
255
256
  revision="4e1d021a6c3fd7ce8aa0a7204057eee5ae61d390",
256
257
  release_date="2025-01-13",
257
258
  n_parameters=int(306 * 1e6),
259
+ n_embedding_parameters=None,
258
260
  memory_usage_mb=1166, # Though the second-stage model is only 140M
259
261
  max_tokens=512,
260
262
  embed_dim=768,
@@ -128,6 +128,7 @@ clip_vit_large_patch14 = ModelMeta(
128
128
  release_date="2021-02-26",
129
129
  modalities=["image", "text"],
130
130
  n_parameters=428_000_000,
131
+ n_embedding_parameters=None,
131
132
  memory_usage_mb=1631,
132
133
  max_tokens=77,
133
134
  embed_dim=768,
@@ -152,6 +153,7 @@ clip_vit_base_patch32 = ModelMeta(
152
153
  release_date="2021-02-26",
153
154
  modalities=["image", "text"],
154
155
  n_parameters=151_000_000,
156
+ n_embedding_parameters=None,
155
157
  memory_usage_mb=576,
156
158
  max_tokens=77,
157
159
  embed_dim=512,
@@ -176,6 +178,7 @@ clip_vit_base_patch16 = ModelMeta(
176
178
  release_date="2021-02-26",
177
179
  modalities=["image", "text"],
178
180
  n_parameters=151_000_000,
181
+ n_embedding_parameters=None,
179
182
  memory_usage_mb=576,
180
183
  max_tokens=77,
181
184
  embed_dim=512,