mteb 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. mteb/_create_dataloaders.py +16 -9
  2. mteb/_evaluators/any_sts_evaluator.py +10 -5
  3. mteb/_evaluators/clustering_evaluator.py +10 -4
  4. mteb/_evaluators/evaluator.py +9 -4
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  6. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  7. mteb/_evaluators/retrieval_evaluator.py +19 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +14 -10
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +17 -9
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +15 -6
  21. mteb/abstasks/clustering.py +17 -8
  22. mteb/abstasks/clustering_legacy.py +14 -6
  23. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  24. mteb/abstasks/multilabel_classification.py +11 -5
  25. mteb/abstasks/pair_classification.py +19 -9
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +28 -17
  28. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  29. mteb/abstasks/sts.py +19 -10
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +14 -7
  32. mteb/abstasks/text/summarization.py +17 -7
  33. mteb/abstasks/zeroshot_classification.py +15 -7
  34. mteb/benchmarks/_create_table.py +13 -3
  35. mteb/benchmarks/benchmark.py +11 -1
  36. mteb/benchmarks/benchmarks/__init__.py +2 -0
  37. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  38. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  39. mteb/cache.py +10 -5
  40. mteb/cli/_display_tasks.py +9 -3
  41. mteb/cli/build_cli.py +5 -2
  42. mteb/cli/generate_model_card.py +9 -2
  43. mteb/deprecated_evaluator.py +16 -12
  44. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  45. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  64. mteb/evaluate.py +20 -18
  65. mteb/filter_tasks.py +12 -7
  66. mteb/get_tasks.py +9 -4
  67. mteb/languages/language_scripts.py +8 -3
  68. mteb/leaderboard/app.py +7 -3
  69. mteb/leaderboard/table.py +7 -2
  70. mteb/load_results.py +9 -3
  71. mteb/models/abs_encoder.py +22 -12
  72. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  73. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  74. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  75. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  76. mteb/models/get_model_meta.py +11 -4
  77. mteb/models/instruct_wrapper.py +13 -5
  78. mteb/models/model_implementations/align_models.py +10 -4
  79. mteb/models/model_implementations/amazon_models.py +1 -0
  80. mteb/models/model_implementations/andersborges.py +2 -0
  81. mteb/models/model_implementations/ara_models.py +1 -0
  82. mteb/models/model_implementations/arctic_models.py +8 -0
  83. mteb/models/model_implementations/b1ade_models.py +1 -0
  84. mteb/models/model_implementations/bedrock_models.py +20 -6
  85. mteb/models/model_implementations/bge_models.py +40 -1
  86. mteb/models/model_implementations/bica_model.py +1 -0
  87. mteb/models/model_implementations/blip2_models.py +11 -4
  88. mteb/models/model_implementations/blip_models.py +17 -4
  89. mteb/models/model_implementations/bm25.py +22 -14
  90. mteb/models/model_implementations/bmretriever_models.py +10 -2
  91. mteb/models/model_implementations/cadet_models.py +1 -0
  92. mteb/models/model_implementations/cde_models.py +11 -5
  93. mteb/models/model_implementations/clip_models.py +12 -4
  94. mteb/models/model_implementations/clips_models.py +3 -0
  95. mteb/models/model_implementations/codefuse_models.py +5 -0
  96. mteb/models/model_implementations/codesage_models.py +3 -0
  97. mteb/models/model_implementations/cohere_models.py +14 -4
  98. mteb/models/model_implementations/cohere_v.py +14 -4
  99. mteb/models/model_implementations/colpali_models.py +7 -3
  100. mteb/models/model_implementations/colqwen_models.py +17 -31
  101. mteb/models/model_implementations/colsmol_models.py +3 -1
  102. mteb/models/model_implementations/conan_models.py +11 -4
  103. mteb/models/model_implementations/dino_models.py +28 -4
  104. mteb/models/model_implementations/e5_instruct.py +4 -0
  105. mteb/models/model_implementations/e5_models.py +9 -0
  106. mteb/models/model_implementations/e5_v.py +10 -4
  107. mteb/models/model_implementations/eagerworks_models.py +11 -4
  108. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  109. mteb/models/model_implementations/en_code_retriever.py +1 -0
  110. mteb/models/model_implementations/euler_models.py +1 -0
  111. mteb/models/model_implementations/evaclip_models.py +13 -4
  112. mteb/models/model_implementations/fa_models.py +9 -0
  113. mteb/models/model_implementations/facebookai.py +2 -0
  114. mteb/models/model_implementations/geogpt_models.py +1 -0
  115. mteb/models/model_implementations/gme_v_models.py +7 -3
  116. mteb/models/model_implementations/google_models.py +15 -4
  117. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  118. mteb/models/model_implementations/gritlm_models.py +2 -0
  119. mteb/models/model_implementations/gte_models.py +9 -0
  120. mteb/models/model_implementations/hinvec_models.py +6 -1
  121. mteb/models/model_implementations/human.py +1 -0
  122. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  123. mteb/models/model_implementations/inf_models.py +2 -0
  124. mteb/models/model_implementations/jasper_models.py +14 -5
  125. mteb/models/model_implementations/jina_clip.py +10 -4
  126. mteb/models/model_implementations/jina_models.py +17 -5
  127. mteb/models/model_implementations/kalm_models.py +24 -12
  128. mteb/models/model_implementations/kblab.py +1 -0
  129. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  130. mteb/models/model_implementations/kfst.py +1 -0
  131. mteb/models/model_implementations/kowshik24_models.py +1 -0
  132. mteb/models/model_implementations/lens_models.py +2 -0
  133. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  134. mteb/models/model_implementations/linq_models.py +7 -1
  135. mteb/models/model_implementations/listconranker.py +10 -4
  136. mteb/models/model_implementations/llm2clip_models.py +12 -4
  137. mteb/models/model_implementations/llm2vec_models.py +20 -6
  138. mteb/models/model_implementations/mcinext_models.py +8 -2
  139. mteb/models/model_implementations/mdbr_models.py +2 -0
  140. mteb/models/model_implementations/misc_models.py +63 -0
  141. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  142. mteb/models/model_implementations/mme5_models.py +2 -1
  143. mteb/models/model_implementations/moco_models.py +11 -4
  144. mteb/models/model_implementations/mod_models.py +2 -1
  145. mteb/models/model_implementations/model2vec_models.py +23 -4
  146. mteb/models/model_implementations/moka_models.py +3 -0
  147. mteb/models/model_implementations/nbailab.py +3 -0
  148. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  149. mteb/models/model_implementations/nomic_models.py +16 -4
  150. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  151. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  152. mteb/models/model_implementations/nvidia_models.py +15 -4
  153. mteb/models/model_implementations/octen_models.py +3 -1
  154. mteb/models/model_implementations/openai_models.py +14 -4
  155. mteb/models/model_implementations/openclip_models.py +17 -4
  156. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  157. mteb/models/model_implementations/ops_moa_models.py +9 -2
  158. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  159. mteb/models/model_implementations/pawan_models.py +1 -0
  160. mteb/models/model_implementations/piccolo_models.py +2 -0
  161. mteb/models/model_implementations/promptriever_models.py +16 -6
  162. mteb/models/model_implementations/pylate_models.py +22 -13
  163. mteb/models/model_implementations/qodo_models.py +2 -0
  164. mteb/models/model_implementations/qtack_models.py +1 -0
  165. mteb/models/model_implementations/qwen3_models.py +11 -1
  166. mteb/models/model_implementations/qzhou_models.py +2 -0
  167. mteb/models/model_implementations/random_baseline.py +4 -3
  168. mteb/models/model_implementations/rasgaard_models.py +1 -0
  169. mteb/models/model_implementations/reasonir_model.py +65 -0
  170. mteb/models/model_implementations/repllama_models.py +15 -6
  171. mteb/models/model_implementations/rerankers_custom.py +13 -4
  172. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  173. mteb/models/model_implementations/richinfoai_models.py +1 -0
  174. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  175. mteb/models/model_implementations/ruri_models.py +10 -0
  176. mteb/models/model_implementations/salesforce_models.py +10 -1
  177. mteb/models/model_implementations/samilpwc_models.py +1 -0
  178. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  179. mteb/models/model_implementations/searchmap_models.py +1 -0
  180. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  181. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  182. mteb/models/model_implementations/seed_models.py +2 -1
  183. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  184. mteb/models/model_implementations/shuu_model.py +1 -0
  185. mteb/models/model_implementations/siglip_models.py +19 -4
  186. mteb/models/model_implementations/slm_models.py +7 -4
  187. mteb/models/model_implementations/sonar_models.py +2 -1
  188. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  189. mteb/models/model_implementations/stella_models.py +6 -0
  190. mteb/models/model_implementations/tarka_models.py +2 -0
  191. mteb/models/model_implementations/text2vec_models.py +3 -0
  192. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  193. mteb/models/model_implementations/uae_models.py +10 -4
  194. mteb/models/model_implementations/vdr_models.py +8 -1
  195. mteb/models/model_implementations/vi_vn_models.py +6 -0
  196. mteb/models/model_implementations/vista_models.py +11 -4
  197. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  198. mteb/models/model_implementations/voyage_models.py +25 -4
  199. mteb/models/model_implementations/voyage_v.py +11 -6
  200. mteb/models/model_implementations/xyz_models.py +1 -0
  201. mteb/models/model_implementations/youtu_models.py +1 -0
  202. mteb/models/model_implementations/yuan_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models_en.py +2 -1
  204. mteb/models/model_meta.py +47 -9
  205. mteb/models/models_protocols.py +19 -18
  206. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  207. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  208. mteb/models/search_wrappers.py +19 -12
  209. mteb/models/sentence_transformer_wrapper.py +4 -3
  210. mteb/models/vllm_wrapper.py +8 -6
  211. mteb/results/benchmark_results.py +22 -17
  212. mteb/results/model_result.py +21 -15
  213. mteb/results/task_result.py +15 -9
  214. mteb/similarity_functions.py +8 -2
  215. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  216. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  220. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  223. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  224. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  225. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  226. mteb/tasks/retrieval/eng/__init__.py +42 -0
  227. mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
  228. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  229. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  230. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  231. mteb/types/_encoder_io.py +1 -1
  232. mteb/types/statistics.py +9 -2
  233. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
  234. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/RECORD +238 -217
  235. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
  236. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
  237. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
  238. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
16
16
  loader=sentence_transformers_loader,
17
17
  open_weights=True,
18
18
  n_parameters=568_000_000,
19
+ n_embedding_parameters=256_002_048,
19
20
  memory_usage_mb=2167,
20
21
  embed_dim=1024,
21
22
  license="cc-by-4.0",
@@ -41,6 +42,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
41
42
  loader=sentence_transformers_loader,
42
43
  open_weights=True,
43
44
  n_parameters=568_000_000,
45
+ n_embedding_parameters=256_002_048,
44
46
  memory_usage_mb=2167,
45
47
  embed_dim=1024,
46
48
  license="cc-by-4.0",
@@ -66,6 +68,7 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
66
68
  loader=sentence_transformers_loader,
67
69
  open_weights=True,
68
70
  n_parameters=568_000_000,
71
+ n_embedding_parameters=256_002_048,
69
72
  memory_usage_mb=2166,
70
73
  embed_dim=1024,
71
74
  license="cc-by-4.0",
@@ -98,6 +101,7 @@ hiieu_halong_embedding = ModelMeta(
98
101
  use_instructions=False,
99
102
  open_weights=True,
100
103
  n_parameters=278_000_000,
104
+ n_embedding_parameters=192_001_536,
101
105
  memory_usage_mb=1061,
102
106
  embed_dim=768,
103
107
  license="apache-2.0",
@@ -129,6 +133,7 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
129
133
  use_instructions=False,
130
134
  open_weights=True,
131
135
  n_parameters=135_000_000,
136
+ n_embedding_parameters=49_152_768,
132
137
  memory_usage_mb=517,
133
138
  max_tokens=256,
134
139
  embed_dim=768,
@@ -167,6 +172,7 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
167
172
  use_instructions=False,
168
173
  open_weights=True,
169
174
  n_parameters=135_000_000,
175
+ n_embedding_parameters=49_152_768,
170
176
  memory_usage_mb=515,
171
177
  max_tokens=256,
172
178
  embed_dim=768,
@@ -1,14 +1,19 @@
1
- from typing import Any, Literal
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal
2
4
 
3
5
  import torch
4
- from torch.utils.data import DataLoader
5
6
  from tqdm.auto import tqdm
6
7
 
7
8
  from mteb._requires_package import requires_image_dependencies
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.types import Array, BatchedInput, PromptType
11
+
12
+ if TYPE_CHECKING:
13
+ from torch.utils.data import DataLoader
14
+
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.types import Array, BatchedInput, PromptType
12
17
 
13
18
  VISTA_CITATION = """@article{zhou2024vista,
14
19
  title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
@@ -253,6 +258,7 @@ visualized_bge_base = ModelMeta(
253
258
  release_date="2024-06-06",
254
259
  modalities=["image", "text"],
255
260
  n_parameters=196_000_000,
261
+ n_embedding_parameters=None,
256
262
  memory_usage_mb=1631,
257
263
  max_tokens=512,
258
264
  embed_dim=768,
@@ -281,6 +287,7 @@ visualized_bge_m3 = ModelMeta(
281
287
  release_date="2024-06-06",
282
288
  modalities=["image", "text"],
283
289
  n_parameters=872_909_505,
290
+ n_embedding_parameters=None,
284
291
  memory_usage_mb=4263,
285
292
  max_tokens=8192,
286
293
  embed_dim=1024,
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
  from tqdm.auto import tqdm
7
8
 
8
9
  from mteb._requires_package import (
@@ -10,10 +11,14 @@ from mteb._requires_package import (
10
11
  requires_package,
11
12
  suggest_package,
12
13
  )
13
- from mteb.abstasks.task_metadata import TaskMetadata
14
14
  from mteb.models.abs_encoder import AbsEncoder
15
15
  from mteb.models.model_meta import ModelMeta, ScoringFunction
16
- from mteb.types import Array, BatchedInput, PromptType
16
+
17
+ if TYPE_CHECKING:
18
+ from torch.utils.data import DataLoader
19
+
20
+ from mteb.abstasks.task_metadata import TaskMetadata
21
+ from mteb.types import Array, BatchedInput, PromptType
17
22
 
18
23
  logger = logging.getLogger(__name__)
19
24
 
@@ -275,6 +280,7 @@ vlm2vec_lora = ModelMeta(
275
280
  release_date="2024-10-08",
276
281
  modalities=["image", "text"],
277
282
  n_parameters=None,
283
+ n_embedding_parameters=None,
278
284
  memory_usage_mb=None,
279
285
  max_tokens=131072,
280
286
  embed_dim=3072,
@@ -299,6 +305,7 @@ vlm2vec_full = ModelMeta(
299
305
  release_date="2024-10-08",
300
306
  modalities=["image", "text"],
301
307
  n_parameters=4_150_000_000,
308
+ n_embedding_parameters=None,
302
309
  memory_usage_mb=7909,
303
310
  max_tokens=131072,
304
311
  embed_dim=3072,
@@ -1,16 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  import time
2
4
  from functools import wraps
3
- from typing import Any, Literal
5
+ from typing import TYPE_CHECKING, Any, Literal
4
6
 
5
7
  import numpy as np
6
- from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
9
10
  from mteb._requires_package import requires_package
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models.abs_encoder import AbsEncoder
12
12
  from mteb.models.model_meta import ModelMeta, ScoringFunction
13
- from mteb.types import Array, BatchedInput, PromptType
13
+ from mteb.types import PromptType
14
+
15
+ if TYPE_CHECKING:
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput
14
20
 
15
21
  VOYAGE_TRAINING_DATA = set(
16
22
  # Self-reported (message from VoyageAI member)
@@ -302,6 +308,7 @@ voyage_3_large = ModelMeta(
302
308
  embed_dim=1024,
303
309
  open_weights=False,
304
310
  n_parameters=None,
311
+ n_embedding_parameters=None,
305
312
  memory_usage_mb=None,
306
313
  license=None,
307
314
  reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/",
@@ -330,6 +337,7 @@ voyage_3_5 = ModelMeta(
330
337
  embed_dim=1024,
331
338
  open_weights=False,
332
339
  n_parameters=None,
340
+ n_embedding_parameters=None,
333
341
  memory_usage_mb=None,
334
342
  license=None,
335
343
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -357,6 +365,7 @@ voyage_3_5_int8 = ModelMeta(
357
365
  embed_dim=1024,
358
366
  open_weights=False,
359
367
  n_parameters=None,
368
+ n_embedding_parameters=None,
360
369
  memory_usage_mb=None,
361
370
  license=None,
362
371
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -384,6 +393,7 @@ voyage_3_5_binary = ModelMeta(
384
393
  embed_dim=1024, # Same as original after unpacking from bits
385
394
  open_weights=False,
386
395
  n_parameters=None,
396
+ n_embedding_parameters=None,
387
397
  memory_usage_mb=None,
388
398
  license=None,
389
399
  reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
@@ -411,6 +421,7 @@ voyage_large_2_instruct = ModelMeta(
411
421
  embed_dim=1024,
412
422
  open_weights=False,
413
423
  n_parameters=None,
424
+ n_embedding_parameters=None,
414
425
  memory_usage_mb=None,
415
426
  license=None,
416
427
  reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/",
@@ -437,6 +448,7 @@ voyage_finance_2 = ModelMeta(
437
448
  embed_dim=1024,
438
449
  open_weights=False,
439
450
  n_parameters=None,
451
+ n_embedding_parameters=None,
440
452
  memory_usage_mb=None,
441
453
  license=None,
442
454
  reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/",
@@ -463,6 +475,7 @@ voyage_law_2 = ModelMeta(
463
475
  embed_dim=1024,
464
476
  open_weights=False,
465
477
  n_parameters=None,
478
+ n_embedding_parameters=None,
466
479
  memory_usage_mb=None,
467
480
  license=None,
468
481
  reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/",
@@ -489,6 +502,7 @@ voyage_code_2 = ModelMeta(
489
502
  embed_dim=1536,
490
503
  open_weights=False,
491
504
  n_parameters=None,
505
+ n_embedding_parameters=None,
492
506
  memory_usage_mb=None,
493
507
  license=None,
494
508
  reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/",
@@ -515,6 +529,7 @@ voyage_code_3 = ModelMeta(
515
529
  embed_dim=1024,
516
530
  open_weights=False,
517
531
  n_parameters=None,
532
+ n_embedding_parameters=None,
518
533
  memory_usage_mb=None,
519
534
  license=None,
520
535
  reference="https://blog.voyageai.com/2024/12/04/voyage-code-3/",
@@ -542,6 +557,7 @@ voyage_large_2 = ModelMeta(
542
557
  embed_dim=1536,
543
558
  open_weights=False,
544
559
  n_parameters=None,
560
+ n_embedding_parameters=None,
545
561
  memory_usage_mb=None,
546
562
  license=None,
547
563
  reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
@@ -568,6 +584,7 @@ voyage_2 = ModelMeta(
568
584
  embed_dim=1024,
569
585
  open_weights=False,
570
586
  n_parameters=None,
587
+ n_embedding_parameters=None,
571
588
  memory_usage_mb=None,
572
589
  license=None,
573
590
  reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
@@ -593,6 +610,7 @@ voyage_multilingual_2 = ModelMeta(
593
610
  embed_dim=1024,
594
611
  open_weights=False,
595
612
  n_parameters=None,
613
+ n_embedding_parameters=None,
596
614
  memory_usage_mb=None,
597
615
  license=None,
598
616
  reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/",
@@ -619,6 +637,7 @@ voyage_3 = ModelMeta(
619
637
  embed_dim=1024,
620
638
  open_weights=False,
621
639
  n_parameters=None,
640
+ n_embedding_parameters=None,
622
641
  memory_usage_mb=None,
623
642
  license=None,
624
643
  reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
@@ -645,6 +664,7 @@ voyage_3_lite = ModelMeta(
645
664
  embed_dim=512,
646
665
  open_weights=False,
647
666
  n_parameters=None,
667
+ n_embedding_parameters=None,
648
668
  memory_usage_mb=None,
649
669
  license=None,
650
670
  reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
@@ -673,6 +693,7 @@ voyage_3_exp = ModelMeta(
673
693
  open_weights=False,
674
694
  # from their card https://huggingface.co/voyageai/voyage-3-m-exp#model-information
675
695
  n_parameters=int(6918 * 1e6),
696
+ n_embedding_parameters=None,
676
697
  memory_usage_mb=None,
677
698
  license=None,
678
699
  reference="https://huggingface.co/voyageai/voyage-3-m-exp",
@@ -4,17 +4,19 @@ import logging
4
4
  from typing import TYPE_CHECKING, Any, Literal
5
5
 
6
6
  import torch
7
- from torch.utils.data import DataLoader
8
7
  from tqdm.auto import tqdm
9
8
 
10
9
  from mteb._requires_package import requires_image_dependencies, requires_package
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
10
  from mteb.models.abs_encoder import AbsEncoder
13
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
12
+ from mteb.types import PromptType
15
13
 
16
14
  if TYPE_CHECKING:
17
15
  from PIL import Image
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput
18
20
 
19
21
  logger = logging.getLogger(__name__)
20
22
 
@@ -27,6 +29,8 @@ def _downsample_image(
27
29
  Returns:
28
30
  The downsampled image.
29
31
  """
32
+ from PIL.Image import Resampling
33
+
30
34
  width, height = image.size
31
35
  pixels = width * height
32
36
 
@@ -42,15 +46,15 @@ def _downsample_image(
42
46
  logger.info(
43
47
  f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
44
48
  )
45
- return image.resize(new_size, Image.LANCZOS)
49
+ return image.resize(new_size, Resampling.LANCZOS)
46
50
  if width > height:
47
51
  if width > 10000:
48
52
  logger.error("Processing extremely wide images.")
49
- return image.resize((10000, height), Image.LANCZOS)
53
+ return image.resize((10000, height), Resampling.LANCZOS)
50
54
  else:
51
55
  if height > 10000:
52
56
  logger.error("Processing extremely high images.")
53
- return image.resize((width, 10000), Image.LANCZOS)
57
+ return image.resize((width, 10000), Resampling.LANCZOS)
54
58
  return image
55
59
 
56
60
 
@@ -211,6 +215,7 @@ voyage_v = ModelMeta(
211
215
  revision="1",
212
216
  release_date="2024-11-10",
213
217
  n_parameters=None,
218
+ n_embedding_parameters=None,
214
219
  memory_usage_mb=None,
215
220
  max_tokens=32768,
216
221
  embed_dim=1024,
@@ -31,6 +31,7 @@ xyz_embedding = ModelMeta(
31
31
  revision="4004120220b99baea764a1d3508427248ac3bccf",
32
32
  release_date="2024-09-13",
33
33
  n_parameters=326000000,
34
+ n_embedding_parameters=21_635_072,
34
35
  memory_usage_mb=1242,
35
36
  max_tokens=512,
36
37
  embed_dim=768,
@@ -121,6 +121,7 @@ Youtu_Embedding_V1 = ModelMeta(
121
121
  release_date="2025-09-28",
122
122
  open_weights=True,
123
123
  n_parameters=2672957440,
124
+ n_embedding_parameters=None,
124
125
  memory_usage_mb=None,
125
126
  embed_dim=2048,
126
127
  license="apache-2.0",
@@ -20,6 +20,7 @@ yuan_embedding_2_zh = ModelMeta(
20
20
  revision="b5ebcace6f4fc6e5a4d1852557eb2dc2d1040cee",
21
21
  release_date="2025-11-24",
22
22
  n_parameters=326000000,
23
+ n_embedding_parameters=21_635_072,
23
24
  memory_usage_mb=1242,
24
25
  embed_dim=1792,
25
26
  license="apache-2.0",
@@ -1,6 +1,6 @@
1
1
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
2
  from mteb.models.model_meta import ModelMeta
3
- from mteb.models.models_protocols import PromptType
3
+ from mteb.types import PromptType
4
4
 
5
5
 
6
6
  def instruction_template(
@@ -43,6 +43,7 @@ yuan_embedding_2_en = ModelMeta(
43
43
  revision="b2fd15da3bcae3473c8529593825c15068f09fce",
44
44
  release_date="2025-11-27",
45
45
  n_parameters=595776512,
46
+ n_embedding_parameters=None,
46
47
  memory_usage_mb=2272,
47
48
  embed_dim=1024,
48
49
  max_tokens=2048,
mteb/models/model_meta.py CHANGED
@@ -3,17 +3,16 @@ from __future__ import annotations
3
3
  import json
4
4
  import logging
5
5
  import warnings
6
- from collections.abc import Callable, Sequence
6
+ from collections.abc import Callable
7
7
  from dataclasses import field
8
8
  from enum import Enum
9
9
  from functools import partial
10
10
  from pathlib import Path
11
11
  from typing import TYPE_CHECKING, Any, Literal, cast
12
12
 
13
+ import numpy as np
13
14
  from huggingface_hub import (
14
- GitCommitInfo,
15
15
  ModelCard,
16
- ModelCardData,
17
16
  get_safetensors_metadata,
18
17
  hf_hub_download,
19
18
  list_repo_commits,
@@ -29,18 +28,27 @@ from huggingface_hub.errors import (
29
28
  SafetensorsParsingError,
30
29
  )
31
30
  from pydantic import BaseModel, ConfigDict, field_validator, model_validator
31
+ from sentence_transformers.models import Transformer
32
+ from torch import nn
32
33
  from transformers import AutoConfig
33
- from typing_extensions import Self
34
34
 
35
35
  from mteb._helpful_enum import HelpfulStrEnum
36
36
  from mteb.languages import check_language_code
37
- from mteb.models.models_protocols import EncoderProtocol, MTEBModels
37
+ from mteb.models.models_protocols import MTEBModels
38
38
  from mteb.types import ISOLanguageScript, Licenses, Modalities, StrDate, StrURL
39
39
 
40
40
  if TYPE_CHECKING:
41
+ from collections.abc import Sequence
42
+
43
+ from huggingface_hub import (
44
+ GitCommitInfo,
45
+ ModelCardData,
46
+ )
41
47
  from sentence_transformers import CrossEncoder, SentenceTransformer
48
+ from typing_extensions import Self
42
49
 
43
50
  from mteb.abstasks import AbsTask
51
+ from mteb.models.models_protocols import EncoderProtocol
44
52
 
45
53
 
46
54
  logger = logging.getLogger(__name__)
@@ -94,8 +102,9 @@ class ModelMeta(BaseModel):
94
102
  loader: The function that loads the model. If None it assumes that the model is not implemented.
95
103
  loader_kwargs: The keyword arguments to pass to the loader function.
96
104
  name: The name of the model, ideally the name on huggingface. It should be in the format "organization/model_name".
97
- n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the number of parameters is not known (e.g. for proprietary models) or
98
- if the loader returns a SentenceTransformer model from which it can be derived.
105
+ n_parameters: The total number of parameters in the model, e.g. `7_000_000` for a 7M parameter model. Can be None in case the number of parameters is unknown.
106
+ n_embedding_parameters: The number of parameters used for the embedding layer. Can be None if the number of embedding parameters is not known (e.g. for proprietary models).
107
+ n_active_parameters_override: The number of active parameters used by the model. Should be used **only** for Mixture of Experts models.
99
108
  memory_usage_mb: The memory usage of the model in MB. Can be None if the memory usage is not known (e.g. for proprietary models). To calculate it use the `calculate_memory_usage_mb` method.
100
109
  max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
101
110
  models).
@@ -134,6 +143,8 @@ class ModelMeta(BaseModel):
134
143
  release_date: StrDate | None
135
144
  languages: list[ISOLanguageScript] | None
136
145
  n_parameters: int | None
146
+ n_active_parameters_override: int | None = None
147
+ n_embedding_parameters: int | None = None
137
148
  memory_usage_mb: float | None
138
149
  max_tokens: float | None
139
150
  embed_dim: int | None
@@ -192,6 +203,16 @@ class ModelMeta(BaseModel):
192
203
  """
193
204
  return "cross-encoder" in self.model_type
194
205
 
206
+ @property
207
+ def n_active_parameters(self):
208
+ """Number of active parameters. Assumed to be `n_parameters - n_embedding_parameters`. Can be overridden using `n_active_parameters_override`, e.g. for MoE models."""
209
+ if self.n_active_parameters_override is not None:
210
+ return self.n_active_parameters_override
211
+
212
+ if self.n_parameters is not None and self.n_embedding_parameters is not None:
213
+ return self.n_parameters - self.n_embedding_parameters
214
+ return None
215
+
195
216
  @field_validator("similarity_fn_name", mode="before")
196
217
  @classmethod
197
218
  def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
@@ -384,6 +405,14 @@ class ModelMeta(BaseModel):
384
405
  else model.model_card_data.base_model
385
406
  )
386
407
  meta = cls._from_hub(name, revision, compute_metadata)
408
+ try:
409
+ first = model[0]
410
+
411
+ if isinstance(first, Transformer):
412
+ emb = first.auto_model.get_input_embeddings()
413
+ meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
414
+ except Exception as e:
415
+ logger.warning(f"Could not calculate embedding parameters for {name}: {e}")
387
416
  meta.revision = model.model_card_data.base_model_revision or meta.revision
388
417
  meta.max_tokens = model.max_seq_length
389
418
  meta.embed_dim = model.get_sentence_embedding_dimension()
@@ -455,6 +484,15 @@ class ModelMeta(BaseModel):
455
484
  from mteb.models import CrossEncoderWrapper
456
485
 
457
486
  meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
487
+ try:
488
+ emb = model.model.get_input_embeddings()
489
+
490
+ if isinstance(emb, nn.Embedding):
491
+ meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
492
+ except Exception as e:
493
+ logger.warning(
494
+ f"Could not calculate embedding parameters for {model.model.name_or_path}: {e}"
495
+ )
458
496
  meta.revision = model.config._commit_hash or meta.revision
459
497
  meta.loader = CrossEncoderWrapper
460
498
  meta.embed_dim = None
@@ -479,7 +517,7 @@ class ModelMeta(BaseModel):
479
517
  if isinstance(tasks[0], str):
480
518
  benchmark_datasets = set(tasks)
481
519
  else:
482
- tasks = cast(Sequence["AbsTask"], tasks)
520
+ tasks = cast("Sequence[AbsTask]", tasks)
483
521
  benchmark_datasets = set()
484
522
  for task in tasks:
485
523
  benchmark_datasets.add(task.metadata.name)
@@ -534,7 +572,7 @@ class ModelMeta(BaseModel):
534
572
  if isinstance(tasks[0], str):
535
573
  benchmark_datasets = set(tasks)
536
574
  else:
537
- tasks = cast(Sequence["AbsTask"], tasks)
575
+ tasks = cast("Sequence[AbsTask]", tasks)
538
576
  benchmark_datasets = {task.metadata.name for task in tasks}
539
577
  overlap = training_datasets & benchmark_datasets
540
578
  perc_overlap = 100 * (len(overlap) / len(benchmark_datasets))
@@ -1,22 +1,23 @@
1
- from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
1
+ from __future__ import annotations
2
2
 
3
- from torch.utils.data import DataLoader
4
- from typing_extensions import Unpack
5
-
6
- from mteb.abstasks.task_metadata import TaskMetadata
7
- from mteb.types import (
8
- Array,
9
- BatchedInput,
10
- CorpusDatasetType,
11
- EncodeKwargs,
12
- PromptType,
13
- QueryDatasetType,
14
- RetrievalOutputType,
15
- TopRankedDocumentsType,
16
- )
3
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
17
4
 
18
5
  if TYPE_CHECKING:
6
+ from torch.utils.data import DataLoader
7
+ from typing_extensions import Unpack
8
+
9
+ from mteb.abstasks.task_metadata import TaskMetadata
19
10
  from mteb.models.model_meta import ModelMeta
11
+ from mteb.types import (
12
+ Array,
13
+ BatchedInput,
14
+ CorpusDatasetType,
15
+ EncodeKwargs,
16
+ PromptType,
17
+ QueryDatasetType,
18
+ RetrievalOutputType,
19
+ TopRankedDocumentsType,
20
+ )
20
21
 
21
22
 
22
23
  @runtime_checkable
@@ -72,7 +73,7 @@ class SearchProtocol(Protocol):
72
73
  ...
73
74
 
74
75
  @property
75
- def mteb_model_meta(self) -> "ModelMeta":
76
+ def mteb_model_meta(self) -> ModelMeta:
76
77
  """Metadata of the model"""
77
78
  ...
78
79
 
@@ -177,7 +178,7 @@ class EncoderProtocol(Protocol):
177
178
  ...
178
179
 
179
180
  @property
180
- def mteb_model_meta(self) -> "ModelMeta":
181
+ def mteb_model_meta(self) -> ModelMeta:
181
182
  """Metadata of the model"""
182
183
  ...
183
184
 
@@ -236,7 +237,7 @@ class CrossEncoderProtocol(Protocol):
236
237
  ...
237
238
 
238
239
  @property
239
- def mteb_model_meta(self) -> "ModelMeta":
240
+ def mteb_model_meta(self) -> ModelMeta:
240
241
  """Metadata of the model"""
241
242
  ...
242
243
 
@@ -1,7 +1,11 @@
1
- from collections.abc import Callable
2
- from typing import Protocol
1
+ from __future__ import annotations
3
2
 
4
- from mteb.types import Array, TopRankedDocumentsType
3
+ from typing import TYPE_CHECKING, Protocol
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Callable
7
+
8
+ from mteb.types import Array, TopRankedDocumentsType
5
9
 
6
10
 
7
11
  class IndexEncoderSearchProtocol(Protocol):
@@ -1,14 +1,23 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import warnings
3
- from collections.abc import Callable
5
+ from typing import TYPE_CHECKING
4
6
 
5
7
  import numpy as np
6
8
  import torch
7
9
 
8
10
  from mteb._requires_package import requires_package
9
11
  from mteb.models.model_meta import ScoringFunction
10
- from mteb.models.models_protocols import EncoderProtocol
11
- from mteb.types import Array, TopRankedDocumentsType
12
+
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Callable
15
+
16
+ import faiss
17
+
18
+ from mteb.models.models_protocols import EncoderProtocol
19
+ from mteb.types import Array, TopRankedDocumentsType
20
+
12
21
 
13
22
  logger = logging.getLogger(__name__)
14
23
 
@@ -33,7 +42,6 @@ class FaissSearchIndex:
33
42
  install_instruction="pip install mteb[faiss-cpu]",
34
43
  )
35
44
 
36
- import faiss
37
45
  from faiss import IndexFlatIP, IndexFlatL2
38
46
 
39
47
  # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
@@ -1,28 +1,35 @@
1
+ from __future__ import annotations
2
+
1
3
  import heapq
2
4
  import logging
3
- from typing import Any
5
+ from typing import TYPE_CHECKING, Any
4
6
 
5
7
  import torch
6
8
  from datasets import Dataset
7
- from torch.utils.data import DataLoader
8
9
 
9
10
  from mteb._create_dataloaders import (
10
11
  create_dataloader,
11
12
  )
12
- from mteb.abstasks.task_metadata import TaskMetadata
13
13
  from mteb.types import (
14
- Array,
15
- BatchedInput,
16
- CorpusDatasetType,
17
- EncodeKwargs,
18
14
  PromptType,
19
- QueryDatasetType,
20
- RetrievalOutputType,
21
- TopRankedDocumentsType,
22
15
  )
23
16
 
24
- from .models_protocols import CrossEncoderProtocol, EncoderProtocol
25
- from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
17
+ if TYPE_CHECKING:
18
+ from torch.utils.data import DataLoader
19
+
20
+ from mteb.abstasks.task_metadata import TaskMetadata
21
+ from mteb.types import (
22
+ Array,
23
+ BatchedInput,
24
+ CorpusDatasetType,
25
+ EncodeKwargs,
26
+ QueryDatasetType,
27
+ RetrievalOutputType,
28
+ TopRankedDocumentsType,
29
+ )
30
+
31
+ from .models_protocols import CrossEncoderProtocol, EncoderProtocol
32
+ from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
26
33
 
27
34
  logger = logging.getLogger(__name__)
28
35