mteb 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. mteb/_create_dataloaders.py +16 -9
  2. mteb/_evaluators/any_sts_evaluator.py +10 -5
  3. mteb/_evaluators/clustering_evaluator.py +10 -4
  4. mteb/_evaluators/evaluator.py +9 -4
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  6. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  7. mteb/_evaluators/retrieval_evaluator.py +19 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +14 -10
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +17 -9
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +15 -6
  21. mteb/abstasks/clustering.py +17 -8
  22. mteb/abstasks/clustering_legacy.py +14 -6
  23. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  24. mteb/abstasks/multilabel_classification.py +11 -5
  25. mteb/abstasks/pair_classification.py +19 -9
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +28 -17
  28. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  29. mteb/abstasks/sts.py +19 -10
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +14 -7
  32. mteb/abstasks/text/summarization.py +17 -7
  33. mteb/abstasks/zeroshot_classification.py +15 -7
  34. mteb/benchmarks/_create_table.py +13 -3
  35. mteb/benchmarks/benchmark.py +11 -1
  36. mteb/benchmarks/benchmarks/__init__.py +2 -0
  37. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  38. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  39. mteb/cache.py +10 -5
  40. mteb/cli/_display_tasks.py +9 -3
  41. mteb/cli/build_cli.py +5 -2
  42. mteb/cli/generate_model_card.py +9 -2
  43. mteb/deprecated_evaluator.py +16 -12
  44. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  45. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  64. mteb/evaluate.py +20 -18
  65. mteb/filter_tasks.py +12 -7
  66. mteb/get_tasks.py +9 -4
  67. mteb/languages/language_scripts.py +8 -3
  68. mteb/leaderboard/app.py +7 -3
  69. mteb/leaderboard/table.py +7 -2
  70. mteb/load_results.py +9 -3
  71. mteb/models/abs_encoder.py +22 -12
  72. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  73. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  74. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  75. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  76. mteb/models/get_model_meta.py +11 -4
  77. mteb/models/instruct_wrapper.py +13 -5
  78. mteb/models/model_implementations/align_models.py +10 -4
  79. mteb/models/model_implementations/amazon_models.py +1 -0
  80. mteb/models/model_implementations/andersborges.py +2 -0
  81. mteb/models/model_implementations/ara_models.py +1 -0
  82. mteb/models/model_implementations/arctic_models.py +8 -0
  83. mteb/models/model_implementations/b1ade_models.py +1 -0
  84. mteb/models/model_implementations/bedrock_models.py +20 -6
  85. mteb/models/model_implementations/bge_models.py +40 -1
  86. mteb/models/model_implementations/bica_model.py +1 -0
  87. mteb/models/model_implementations/blip2_models.py +11 -4
  88. mteb/models/model_implementations/blip_models.py +17 -4
  89. mteb/models/model_implementations/bm25.py +22 -14
  90. mteb/models/model_implementations/bmretriever_models.py +10 -2
  91. mteb/models/model_implementations/cadet_models.py +1 -0
  92. mteb/models/model_implementations/cde_models.py +11 -5
  93. mteb/models/model_implementations/clip_models.py +12 -4
  94. mteb/models/model_implementations/clips_models.py +3 -0
  95. mteb/models/model_implementations/codefuse_models.py +5 -0
  96. mteb/models/model_implementations/codesage_models.py +3 -0
  97. mteb/models/model_implementations/cohere_models.py +14 -4
  98. mteb/models/model_implementations/cohere_v.py +14 -4
  99. mteb/models/model_implementations/colpali_models.py +7 -3
  100. mteb/models/model_implementations/colqwen_models.py +17 -31
  101. mteb/models/model_implementations/colsmol_models.py +3 -1
  102. mteb/models/model_implementations/conan_models.py +11 -4
  103. mteb/models/model_implementations/dino_models.py +28 -4
  104. mteb/models/model_implementations/e5_instruct.py +4 -0
  105. mteb/models/model_implementations/e5_models.py +9 -0
  106. mteb/models/model_implementations/e5_v.py +10 -4
  107. mteb/models/model_implementations/eagerworks_models.py +11 -4
  108. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  109. mteb/models/model_implementations/en_code_retriever.py +1 -0
  110. mteb/models/model_implementations/euler_models.py +1 -0
  111. mteb/models/model_implementations/evaclip_models.py +13 -4
  112. mteb/models/model_implementations/fa_models.py +9 -0
  113. mteb/models/model_implementations/facebookai.py +2 -0
  114. mteb/models/model_implementations/geogpt_models.py +1 -0
  115. mteb/models/model_implementations/gme_v_models.py +7 -3
  116. mteb/models/model_implementations/google_models.py +15 -4
  117. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  118. mteb/models/model_implementations/gritlm_models.py +2 -0
  119. mteb/models/model_implementations/gte_models.py +9 -0
  120. mteb/models/model_implementations/hinvec_models.py +6 -1
  121. mteb/models/model_implementations/human.py +1 -0
  122. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  123. mteb/models/model_implementations/inf_models.py +2 -0
  124. mteb/models/model_implementations/jasper_models.py +14 -5
  125. mteb/models/model_implementations/jina_clip.py +10 -4
  126. mteb/models/model_implementations/jina_models.py +17 -5
  127. mteb/models/model_implementations/kalm_models.py +24 -12
  128. mteb/models/model_implementations/kblab.py +1 -0
  129. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  130. mteb/models/model_implementations/kfst.py +1 -0
  131. mteb/models/model_implementations/kowshik24_models.py +1 -0
  132. mteb/models/model_implementations/lens_models.py +2 -0
  133. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  134. mteb/models/model_implementations/linq_models.py +7 -1
  135. mteb/models/model_implementations/listconranker.py +10 -4
  136. mteb/models/model_implementations/llm2clip_models.py +12 -4
  137. mteb/models/model_implementations/llm2vec_models.py +20 -6
  138. mteb/models/model_implementations/mcinext_models.py +8 -2
  139. mteb/models/model_implementations/mdbr_models.py +2 -0
  140. mteb/models/model_implementations/misc_models.py +63 -0
  141. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  142. mteb/models/model_implementations/mme5_models.py +2 -1
  143. mteb/models/model_implementations/moco_models.py +11 -4
  144. mteb/models/model_implementations/mod_models.py +2 -1
  145. mteb/models/model_implementations/model2vec_models.py +23 -4
  146. mteb/models/model_implementations/moka_models.py +3 -0
  147. mteb/models/model_implementations/nbailab.py +3 -0
  148. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  149. mteb/models/model_implementations/nomic_models.py +16 -4
  150. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  151. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  152. mteb/models/model_implementations/nvidia_models.py +15 -4
  153. mteb/models/model_implementations/octen_models.py +3 -1
  154. mteb/models/model_implementations/openai_models.py +14 -4
  155. mteb/models/model_implementations/openclip_models.py +17 -4
  156. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  157. mteb/models/model_implementations/ops_moa_models.py +9 -2
  158. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  159. mteb/models/model_implementations/pawan_models.py +1 -0
  160. mteb/models/model_implementations/piccolo_models.py +2 -0
  161. mteb/models/model_implementations/promptriever_models.py +16 -6
  162. mteb/models/model_implementations/pylate_models.py +22 -13
  163. mteb/models/model_implementations/qodo_models.py +2 -0
  164. mteb/models/model_implementations/qtack_models.py +1 -0
  165. mteb/models/model_implementations/qwen3_models.py +11 -1
  166. mteb/models/model_implementations/qzhou_models.py +2 -0
  167. mteb/models/model_implementations/random_baseline.py +4 -3
  168. mteb/models/model_implementations/rasgaard_models.py +1 -0
  169. mteb/models/model_implementations/reasonir_model.py +65 -0
  170. mteb/models/model_implementations/repllama_models.py +15 -6
  171. mteb/models/model_implementations/rerankers_custom.py +13 -4
  172. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  173. mteb/models/model_implementations/richinfoai_models.py +1 -0
  174. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  175. mteb/models/model_implementations/ruri_models.py +10 -0
  176. mteb/models/model_implementations/salesforce_models.py +10 -1
  177. mteb/models/model_implementations/samilpwc_models.py +1 -0
  178. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  179. mteb/models/model_implementations/searchmap_models.py +1 -0
  180. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  181. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  182. mteb/models/model_implementations/seed_models.py +2 -1
  183. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  184. mteb/models/model_implementations/shuu_model.py +1 -0
  185. mteb/models/model_implementations/siglip_models.py +19 -4
  186. mteb/models/model_implementations/slm_models.py +7 -4
  187. mteb/models/model_implementations/sonar_models.py +2 -1
  188. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  189. mteb/models/model_implementations/stella_models.py +6 -0
  190. mteb/models/model_implementations/tarka_models.py +2 -0
  191. mteb/models/model_implementations/text2vec_models.py +3 -0
  192. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  193. mteb/models/model_implementations/uae_models.py +10 -4
  194. mteb/models/model_implementations/vdr_models.py +8 -1
  195. mteb/models/model_implementations/vi_vn_models.py +6 -0
  196. mteb/models/model_implementations/vista_models.py +11 -4
  197. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  198. mteb/models/model_implementations/voyage_models.py +25 -4
  199. mteb/models/model_implementations/voyage_v.py +11 -6
  200. mteb/models/model_implementations/xyz_models.py +1 -0
  201. mteb/models/model_implementations/youtu_models.py +1 -0
  202. mteb/models/model_implementations/yuan_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models_en.py +2 -1
  204. mteb/models/model_meta.py +47 -9
  205. mteb/models/models_protocols.py +19 -18
  206. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  207. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  208. mteb/models/search_wrappers.py +19 -12
  209. mteb/models/sentence_transformer_wrapper.py +4 -3
  210. mteb/models/vllm_wrapper.py +8 -6
  211. mteb/results/benchmark_results.py +22 -17
  212. mteb/results/model_result.py +21 -15
  213. mteb/results/task_result.py +15 -9
  214. mteb/similarity_functions.py +8 -2
  215. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  216. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  220. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  223. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  224. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  225. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  226. mteb/tasks/retrieval/eng/__init__.py +42 -0
  227. mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
  228. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  229. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  230. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  231. mteb/types/_encoder_io.py +1 -1
  232. mteb/types/statistics.py +9 -2
  233. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
  234. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/RECORD +238 -217
  235. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
  236. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
  237. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
  238. {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
mteb/models/model_implementations/moka_models.py
@@ -97,6 +97,7 @@ m3e_base = ModelMeta(
     revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
     release_date="2023-06-06",  # first commit
     n_parameters=int(102 * 1e6),
+    n_embedding_parameters=16_226_304,
     memory_usage_mb=390,
     embed_dim=768,
     # They don't give a specific license but commercial use is not allowed
@@ -123,6 +124,7 @@ m3e_small = ModelMeta(
     revision="44c696631b2a8c200220aaaad5f987f096e986df",
     release_date="2023-06-02",  # first commit
     n_parameters=None,
+    n_embedding_parameters=10_817_536,
     memory_usage_mb=None,  # Can't be seen on HF page
     embed_dim=512,
     # They don't give a specific license but commercial use is not allowed
@@ -149,6 +151,7 @@ m3e_large = ModelMeta(
     revision="12900375086c37ba5d83d1e417b21dc7d1d1f388",
     release_date="2023-06-21",  # first commit
     n_parameters=None,
+    n_embedding_parameters=21_635_072,
     memory_usage_mb=None,  # Can't be seen on HF page
     embed_dim=768,
     # They don't give a specific license but commercial use is not allowed

mteb/models/model_implementations/nbailab.py
@@ -12,6 +12,7 @@ nb_sbert = ModelMeta(
     revision="b95656350a076aeafd2d23763660f80655408cc6",
     release_date="2022-11-23",
     n_parameters=1_780_000_000,
+    n_embedding_parameters=91_812_096,
     memory_usage_mb=678,
     embed_dim=4096,
     license="apache-2.0",
@@ -34,6 +35,7 @@ nb_bert_large = ModelMeta(
     revision="f9d0fc184adab4dc354d85e1854b7634540d7550",
     release_date="2021-04-29",
     n_parameters=355087360,
+    n_embedding_parameters=51_200_000,
     memory_usage_mb=1359,
     embed_dim=1024,
     license="cc-by-4.0",
@@ -56,6 +58,7 @@ nb_bert_base = ModelMeta(
     revision="9417c3f62a3adc99f17ff92bff446f35d011f994",
     release_date="2021-01-13",
     n_parameters=177853440,
+    n_embedding_parameters=91_812_096,
     memory_usage_mb=681,
     embed_dim=768,
     license="cc-by-4.0",

mteb/models/model_implementations/no_instruct_sentence_models.py
@@ -1,15 +1,22 @@
-from collections.abc import Generator
+from __future__ import annotations
+
 from itertools import islice
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 
 # https://docs.python.org/3/library/itertools.html#itertools.batched
@@ -103,6 +110,7 @@ no_instruct_small_v0 = ModelMeta(
     revision="b38747000553d8268915c95a55fc87e707c9aadd",
     release_date="2024-05-01",  # first commit
     n_parameters=33_400_000,
+    n_embedding_parameters=11_720_448,
     memory_usage_mb=127,
     max_tokens=512,
     embed_dim=384,
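
The import reshuffle at the top of this file recurs in the nomic, nvidia, openai, openclip, opensearch, ops_moa, and promptriever modules below: typing-only imports move under an if TYPE_CHECKING: guard, and from __future__ import annotations stores annotations as strings, so heavyweight modules are no longer loaded at runtime. A minimal sketch of the pattern (the function below is illustrative, not mteb code):

# With the __future__ import, annotations are never evaluated at runtime,
# so the TYPE_CHECKING block is read only by static checkers such as mypy.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from torch.utils.data import DataLoader  # not imported at runtime


def first_batch(loader: DataLoader):
    # "DataLoader" is a plain string annotation here; importing this
    # module no longer pulls in torch.
    return next(iter(loader))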

mteb/models/model_implementations/nomic_models.py
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
 from packaging.version import Version
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
-from mteb.types import Array, BatchedInput, PromptType
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -209,6 +215,7 @@ nomic_embed_v1_5 = ModelMeta(
     release_date="2024-02-10",  # first commit
     citation=NOMIC_CITATION,
     n_parameters=137_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=522,
     max_tokens=8192,
     embed_dim=768,
@@ -243,6 +250,7 @@ nomic_embed_v1 = ModelMeta(
     revision="0759316f275aa0cb93a5b830973843ca66babcf5",
     release_date="2024-01-31",  # first commit
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=522,
     max_tokens=8192,
     embed_dim=768,
@@ -278,6 +286,7 @@ nomic_embed_v1_ablated = ModelMeta(
     revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f",
     release_date="2024-01-15",  # first commit
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     max_tokens=8192,
     embed_dim=768,
@@ -306,6 +315,7 @@ nomic_embed_v1_unsupervised = ModelMeta(
     revision="b53d557b15ae63852847c222d336c1609eced93c",
     release_date="2024-01-15",  # first commit
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     max_tokens=8192,
     embed_dim=768,
@@ -334,6 +344,7 @@ nomic_modern_bert_embed = ModelMeta(
     revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12",
     release_date="2024-12-29",
     n_parameters=149_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=568,
     max_tokens=8192,
     embed_dim=768,
@@ -473,6 +484,7 @@ nomic_embed_text_v2_moe = ModelMeta(
     revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
     release_date="2025-02-07",
     n_parameters=475292928,
+    n_embedding_parameters=None,
     memory_usage_mb=1813,
     max_tokens=512,
     embed_dim=768,

mteb/models/model_implementations/nomic_models_vision.py
@@ -4,17 +4,18 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
 
 if TYPE_CHECKING:
     from PIL import Image
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
     title={Nomic Embed Vision: Expanding the Latent Space},
@@ -174,6 +175,7 @@ nomic_embed_vision_v1_5 = ModelMeta(
     release_date="2024-06-08",
     modalities=["image", "text"],
     n_parameters=92_900_000,
+    n_embedding_parameters=None,
     memory_usage_mb=355,
     max_tokens=2048,
     embed_dim=768,

mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py
@@ -1,14 +1,18 @@
-from typing import Any
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
 from packaging.version import Version
 from torch.utils.data import DataLoader
 from transformers import __version__ as transformers_version
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
-from mteb.types import Array, BatchedInput, PromptType
+
+if TYPE_CHECKING:
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
     title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
@@ -158,6 +162,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
     release_date="2025-06-27",
     modalities=["image", "text"],
     n_parameters=2_418_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=4610,
     max_tokens=8192,
     embed_dim=2048,
@@ -185,6 +190,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
     release_date="2025-06-27",
     modalities=["image", "text"],
     n_parameters=4_407_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=8403,
     max_tokens=8192,
     embed_dim=3072,

mteb/models/model_implementations/nvidia_models.py
@@ -1,11 +1,11 @@
+from __future__ import annotations
+
 import logging
-from collections.abc import Callable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
 from packaging.version import Version
-from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import AutoModel, AutoTokenizer
 from transformers import __version__ as transformers_version
@@ -16,7 +16,15 @@ from mteb.models import CrossEncoderWrapper
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.utils.data import DataLoader
+
+    from mteb import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -196,6 +204,7 @@ NV_embed_v2 = ModelMeta(
     revision="7604d305b621f14095a1aa23d351674c2859553a",
     release_date="2024-09-09",  # initial commit of hf model.
     n_parameters=7_850_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=14975,
     embed_dim=4096,
     license="cc-by-nc-4.0",
@@ -227,6 +236,7 @@ NV_embed_v1 = ModelMeta(
     revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c",
     release_date="2024-09-13",  # initial commit of hf model.
     n_parameters=7_850_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=14975,
     embed_dim=4096,
     license="cc-by-nc-4.0",
@@ -616,6 +626,7 @@ llama_embed_nemotron_8b = ModelMeta(
     revision="84a375593d27d3528beb4e104822515659e093b4",
     release_date="2025-10-23",
     n_parameters=7_504_924_672,
+    n_embedding_parameters=None,
     memory_usage_mb=28629,
     embed_dim=4096,
     license="https://huggingface.co/nvidia/llama-embed-nemotron-8b/blob/main/LICENSE",

mteb/models/model_implementations/octen_models.py
@@ -1,6 +1,6 @@
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta
-from mteb.models.models_protocols import PromptType
+from mteb.types import PromptType
 
 
 def instruction_template(
@@ -208,6 +208,7 @@ Octen_Embedding_4B = ModelMeta(
     revision="6e188e3b072c3e3678b235ad84e6e97bcbb71e8f",
     release_date="2025-12-30",
     n_parameters=4021774336,
+    n_embedding_parameters=None,
     memory_usage_mb=7671,
     embed_dim=2560,
     max_tokens=32768,
@@ -238,6 +239,7 @@ Octen_Embedding_8B = ModelMeta(
     revision="f7db178d5a82fb841f606a6a67c423cead2fdbba",
     release_date="2025-12-23",
     n_parameters=7567295488,
+    n_embedding_parameters=None,
     memory_usage_mb=14433,
     embed_dim=4096,
     max_tokens=32768,

mteb/models/model_implementations/openai_models.py
@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import logging
-from typing import Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import numpy as np
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -180,6 +185,7 @@ text_embedding_3_small = ModelMeta(
     embed_dim=1536,
     open_weights=False,
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     license=None,
     reference="https://openai.com/index/new-embedding-models-and-api-updates/",
@@ -208,6 +214,7 @@ text_embedding_3_large = ModelMeta(
     framework=["API"],
     use_instructions=False,
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     public_training_code=None,
     public_training_data=None,  # assumed
@@ -233,6 +240,7 @@ text_embedding_ada_002 = ModelMeta(
     framework=["API"],
     use_instructions=False,
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     public_training_code=None,
     public_training_data=None,  # assumed
@@ -257,6 +265,7 @@ text_embedding_3_small_512 = ModelMeta(
     embed_dim=512,
     open_weights=False,
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     license=None,
     reference="https://openai.com/index/new-embedding-models-and-api-updates/",
@@ -287,6 +296,7 @@ text_embedding_3_large_512 = ModelMeta(
     framework=["API"],
     use_instructions=False,
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,
     public_training_code=None,
     public_training_data=None,  # assumed

mteb/models/model_implementations/openclip_models.py
@@ -1,14 +1,19 @@
-from typing import Any
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_image_dependencies, requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
     title={Reproducible scaling laws for contrastive language-image learning},
@@ -128,6 +133,7 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
     release_date="2023-04-26",
     modalities=["image", "text"],
     n_parameters=428_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=1633,
     max_tokens=77,
     embed_dim=768,
@@ -154,6 +160,7 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
     release_date="2023-04-26",
     modalities=["image", "text"],
     n_parameters=151_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=576,
     max_tokens=77,
     embed_dim=512,
@@ -180,6 +187,7 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
     release_date="2023-04-26",
     modalities=["image", "text"],
     n_parameters=150_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=572,
     max_tokens=77,
     embed_dim=512,
@@ -206,6 +214,7 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
     release_date="2023-01-23",
     modalities=["image", "text"],
     n_parameters=2_540_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=9689,
     max_tokens=77,
     embed_dim=1280,
@@ -232,6 +241,7 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
     release_date="2023-03-06",
     modalities=["image", "text"],
     n_parameters=1_367_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=5215,
     max_tokens=77,
     embed_dim=1024,
@@ -258,6 +268,7 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
     release_date="2022-09-15",
     modalities=["image", "text"],
     n_parameters=986_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=3762,
     max_tokens=77,
     embed_dim=1024,
@@ -284,6 +295,7 @@ CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
     release_date="2022-09-15",
     modalities=["image", "text"],
     n_parameters=428_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=1631,
     max_tokens=77,
     embed_dim=768,
@@ -310,6 +322,7 @@ CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
     release_date="2022-09-15",
     modalities=["image", "text"],
     n_parameters=151_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=577,
     max_tokens=77,
     embed_dim=512,

mteb/models/model_implementations/opensearch_neural_sparse_models.py
@@ -1,12 +1,18 @@
-from typing import Any
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
-from mteb.types import Array, BatchedInput, PromptType
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 v2_training_data = {
     "MSMARCO",
@@ -134,6 +140,7 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
     revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6",
     release_date="2025-06-18",
     n_parameters=137_394_234,
+    n_embedding_parameters=23_440_896,
     memory_usage_mb=549,
     embed_dim=30522,
     license="apache-2.0",
@@ -160,6 +167,7 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
     revision="babf71f3c48695e2e53a978208e8aba48335e3c0",
     release_date="2025-03-28",
     n_parameters=66_985_530,
+    n_embedding_parameters=23_440_896,
     memory_usage_mb=267,
     embed_dim=30522,
     license="apache-2.0",
@@ -182,6 +190,7 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
     revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f",
     release_date="2024-07-17",
     n_parameters=66_985_530,
+    n_embedding_parameters=23_440_896,
     memory_usage_mb=267,
     embed_dim=30522,
     license="apache-2.0",
@@ -205,6 +214,7 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
     revision="4af867a426867dfdd744097531046f4289a32fdd",
     release_date="2024-07-18",
     n_parameters=22_744_506,
+    n_embedding_parameters=11_720_448,
     memory_usage_mb=86,
     embed_dim=30522,
     license="apache-2.0",
@@ -227,6 +237,7 @@ opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
     revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d",
     release_date="2024-03-07",
     n_parameters=132_955_194,
+    n_embedding_parameters=23_440_896,
     memory_usage_mb=507,
     embed_dim=30522,
     license="apache-2.0",

mteb/models/model_implementations/ops_moa_models.py
@@ -1,8 +1,13 @@
-import numpy as np
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
 
+if TYPE_CHECKING:
+    from mteb.types import Array
+
 
 class OPSWrapper(AbsEncoder):
     def __init__(self, model_name: str, revision: str):
@@ -15,7 +20,7 @@ class OPSWrapper(AbsEncoder):
         )
         self.output_dim = 1536
 
-    def encode(self, sentences: list[str], **kwargs) -> np.ndarray:
+    def encode(self, sentences: list[str], **kwargs) -> Array:
         embeddings = self.model.encode(sentences, **kwargs)
         return embeddings[:, : self.output_dim]
 
@@ -28,6 +33,7 @@ ops_moa_conan_embedding = ModelMeta(
     languages=["zho-Hans"],
     loader=OPSWrapper,
     n_parameters=int(343 * 1e6),
+    n_embedding_parameters=21_635_072,
     memory_usage_mb=1308,
     max_tokens=512,
     embed_dim=1536,
@@ -60,6 +66,7 @@ ops_moa_yuan_embedding = ModelMeta(
     languages=["zho-Hans"],
     loader=OPSWrapper,
     n_parameters=int(343 * 1e6),
+    n_embedding_parameters=21_635_072,
     memory_usage_mb=1242,
     max_tokens=512,
     embed_dim=1536,
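
Only the annotation on OPSWrapper.encode changed here (np.ndarray became mteb's Array alias, matching the removed numpy import); the behavior, keeping the first output_dim components of each embedding, is untouched. A standalone sketch of that truncation (the function name and input width are illustrative, not mteb API):

# Illustrative only: the leading-dimension truncation that
# OPSWrapper.encode applies; the 2048-dim input is made up for the example.
import numpy as np

def truncate(embeddings: np.ndarray, output_dim: int = 1536) -> np.ndarray:
    return embeddings[:, :output_dim]

full = np.zeros((4, 2048), dtype=np.float32)
assert truncate(full).shape == (4, 1536)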

mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py
@@ -4,6 +4,7 @@ solon_embeddings_1_1 = ModelMeta(
     name="OrdalieTech/Solon-embeddings-mini-beta-1.1",
     languages=["fra-Latn"],
     n_parameters=210_000_000,
+    n_embedding_parameters=None,
     public_training_code=None,
     memory_usage_mb=808.0,
     open_weights=True,

mteb/models/model_implementations/pawan_models.py
@@ -20,6 +20,7 @@ pawan_embd_68m = ModelMeta(
     revision="32f295145802bdbd65699ad65fd27d2a5b69a909",
     release_date="2025-12-08",
     n_parameters=68_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=260,
     embed_dim=768,
     license="apache-2.0",

mteb/models/model_implementations/piccolo_models.py
@@ -12,6 +12,7 @@ piccolo_base_zh = ModelMeta(
     revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
     release_date="2023-09-04",  # first commit
     n_parameters=None,
+    n_embedding_parameters=16_226_304,
     memory_usage_mb=None,  # can't see on model card
     embed_dim=768,
     license="mit",
@@ -37,6 +38,7 @@ piccolo_large_zh_v2 = ModelMeta(
     revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
     release_date="2024-04-22",  # first commit
     n_parameters=None,
+    n_embedding_parameters=None,
     memory_usage_mb=None,  # we don't know because they removed the model
     embed_dim=1024,
     license="not specified",

mteb/models/model_implementations/promptriever_models.py
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import logging
-from collections.abc import Callable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol
-from mteb.types import Array, BatchedInput, PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.models.models_protocols import EncoderProtocol
+    from mteb.types import Array, BatchedInput, PromptType
 
 from .repllama_models import RepLLaMAModel, model_prompts
 
@@ -81,6 +87,7 @@ promptriever_llama2 = ModelMeta(
     revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23",  # base-peft revision
     release_date="2024-09-15",
     n_parameters=7_000_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=26703,
     max_tokens=4096,
     embed_dim=4096,
@@ -117,6 +124,7 @@ promptriever_llama3 = ModelMeta(
     },
     release_date="2024-09-15",
     n_parameters=8_000_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=30518,
     max_tokens=8192,
     embed_dim=4096,
@@ -146,6 +154,7 @@ promptriever_llama3_instruct = ModelMeta(
     revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21",  # base-peft revision
     release_date="2024-09-15",
     n_parameters=8_000_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=30518,
     max_tokens=8192,
     embed_dim=4096,
@@ -179,6 +188,7 @@ promptriever_mistral_v1 = ModelMeta(
     revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5",  # base-peft revision
     release_date="2024-09-15",
     n_parameters=7_000_000_000,
+    n_embedding_parameters=131_072_000,
     memory_usage_mb=26703,
     training_datasets={
         # "samaya-ai/msmarco-w-instructions",