mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -97,6 +97,7 @@ m3e_base = ModelMeta(
97
97
  revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
98
98
  release_date="2023-06-06", # first commit
99
99
  n_parameters=int(102 * 1e6),
100
+ n_embedding_parameters=16_226_304,
100
101
  memory_usage_mb=390,
101
102
  embed_dim=768,
102
103
  # They don't give a specific license but commercial use is not allowed
@@ -104,7 +105,7 @@ m3e_base = ModelMeta(
104
105
  max_tokens=512,
105
106
  reference="https://huggingface.co/moka-ai/m3e-base",
106
107
  similarity_fn_name=ScoringFunction.COSINE,
107
- framework=["Sentence Transformers", "PyTorch"],
108
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
108
109
  use_instructions=False,
109
110
  superseded_by=None,
110
111
  adapted_from=None,
@@ -123,6 +124,7 @@ m3e_small = ModelMeta(
123
124
  revision="44c696631b2a8c200220aaaad5f987f096e986df",
124
125
  release_date="2023-06-02", # first commit
125
126
  n_parameters=None,
127
+ n_embedding_parameters=10_817_536,
126
128
  memory_usage_mb=None, # Can't be seen on HF page
127
129
  embed_dim=512,
128
130
  # They don't give a specific license but commercial use is not allowed
@@ -149,6 +151,7 @@ m3e_large = ModelMeta(
149
151
  revision="12900375086c37ba5d83d1e417b21dc7d1d1f388",
150
152
  release_date="2023-06-21", # first commit
151
153
  n_parameters=None,
154
+ n_embedding_parameters=21_635_072,
152
155
  memory_usage_mb=None, # Can't be seen on HF page
153
156
  embed_dim=768,
154
157
  # They don't give a specific license but commercial use is not allowed
@@ -12,13 +12,14 @@ nb_sbert = ModelMeta(
12
12
  revision="b95656350a076aeafd2d23763660f80655408cc6",
13
13
  release_date="2022-11-23",
14
14
  n_parameters=1_780_000_000,
15
+ n_embedding_parameters=91_812_096,
15
16
  memory_usage_mb=678,
16
17
  embed_dim=4096,
17
18
  license="apache-2.0",
18
19
  max_tokens=75,
19
20
  reference="https://huggingface.co/NbAiLab/nb-sbert-base",
20
21
  similarity_fn_name=ScoringFunction.COSINE,
21
- framework=["Sentence Transformers", "PyTorch"],
22
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
22
23
  use_instructions=False,
23
24
  public_training_code=None,
24
25
  public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian",
@@ -34,13 +35,14 @@ nb_bert_large = ModelMeta(
34
35
  revision="f9d0fc184adab4dc354d85e1854b7634540d7550",
35
36
  release_date="2021-04-29",
36
37
  n_parameters=355087360,
38
+ n_embedding_parameters=51_200_000,
37
39
  memory_usage_mb=1359,
38
40
  embed_dim=1024,
39
41
  license="cc-by-4.0",
40
42
  max_tokens=512,
41
43
  reference="https://huggingface.co/NbAiLab/nb-bert-large",
42
44
  similarity_fn_name=ScoringFunction.COSINE,
43
- framework=["Sentence Transformers", "PyTorch"],
45
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
44
46
  use_instructions=False,
45
47
  public_training_code=None,
46
48
  public_training_data="https://huggingface.co/NbAiLab/nb-bert-large#training-data",
@@ -56,13 +58,14 @@ nb_bert_base = ModelMeta(
56
58
  revision="9417c3f62a3adc99f17ff92bff446f35d011f994",
57
59
  release_date="2021-01-13",
58
60
  n_parameters=177853440,
61
+ n_embedding_parameters=91_812_096,
59
62
  memory_usage_mb=681,
60
63
  embed_dim=768,
61
64
  license="cc-by-4.0",
62
65
  max_tokens=512,
63
66
  reference="https://huggingface.co/NbAiLab/nb-bert-base",
64
67
  similarity_fn_name=ScoringFunction.COSINE,
65
- framework=["Sentence Transformers", "PyTorch"],
68
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
66
69
  use_instructions=False,
67
70
  public_training_code=None,
68
71
  public_training_data="https://huggingface.co/NbAiLab/nb-bert-base#training-data",
@@ -1,15 +1,22 @@
1
- from collections.abc import Generator
1
+ from __future__ import annotations
2
+
2
3
  from itertools import islice
3
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
4
5
 
5
6
  import numpy as np
6
7
  import torch
7
- from torch.utils.data import DataLoader
8
8
 
9
- from mteb.abstasks.task_metadata import TaskMetadata
10
9
  from mteb.models.abs_encoder import AbsEncoder
11
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
12
- from mteb.types import Array, BatchedInput, PromptType
11
+ from mteb.types import PromptType
12
+
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Generator
15
+
16
+ from torch.utils.data import DataLoader
17
+
18
+ from mteb.abstasks.task_metadata import TaskMetadata
19
+ from mteb.types import Array, BatchedInput
13
20
 
14
21
 
15
22
  # https://docs.python.org/3/library/itertools.html#itertools.batched
@@ -30,13 +37,13 @@ class NoInstructModel(AbsEncoder):
30
37
  self,
31
38
  model_name: str,
32
39
  revision: str,
40
+ device: str | None = None,
33
41
  model_prompts: dict[str, str] | None = None,
34
42
  **kwargs: Any,
35
43
  ):
36
44
  from transformers import AutoModel, AutoTokenizer
37
45
 
38
46
  self.model_name = model_name
39
- device = kwargs.pop("device", None)
40
47
  self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
41
48
  self.model = AutoModel.from_pretrained(
42
49
  model_name, revision=revision, **kwargs
@@ -103,13 +110,14 @@ no_instruct_small_v0 = ModelMeta(
103
110
  revision="b38747000553d8268915c95a55fc87e707c9aadd",
104
111
  release_date="2024-05-01", # first commit
105
112
  n_parameters=33_400_000,
113
+ n_embedding_parameters=11_720_448,
106
114
  memory_usage_mb=127,
107
115
  max_tokens=512,
108
116
  embed_dim=384,
109
117
  license="mit",
110
118
  reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0",
111
119
  similarity_fn_name=ScoringFunction.COSINE,
112
- framework=["PyTorch"],
120
+ framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
113
121
  use_instructions=False,
114
122
  adapted_from=None,
115
123
  superseded_by=None,
@@ -1,15 +1,21 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
7
  import torch.nn.functional as F
6
8
  from packaging.version import Version
7
- from torch.utils.data import DataLoader
8
9
 
9
- from mteb.abstasks.task_metadata import TaskMetadata
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
12
- from mteb.types import Array, BatchedInput, PromptType
12
+ from mteb.types import PromptType
13
+
14
+ if TYPE_CHECKING:
15
+ from torch.utils.data import DataLoader
16
+
17
+ from mteb.abstasks.task_metadata import TaskMetadata
18
+ from mteb.types import Array, BatchedInput
13
19
 
14
20
  logger = logging.getLogger(__name__)
15
21
 
@@ -23,6 +29,7 @@ class NomicWrapper(SentenceTransformerEncoderWrapper):
23
29
  self,
24
30
  model_name: str,
25
31
  revision: str,
32
+ device: str | None = None,
26
33
  model_prompts: dict[str, str] | None = None,
27
34
  **kwargs: Any,
28
35
  ):
@@ -37,7 +44,9 @@ class NomicWrapper(SentenceTransformerEncoderWrapper):
37
44
  f"Current transformers version is {transformers.__version__} is lower than the required version"
38
45
  f" {MODERN_BERT_TRANSFORMERS_MIN_VERSION}"
39
46
  )
40
- super().__init__(model_name, revision, model_prompts, **kwargs)
47
+ super().__init__(
48
+ model_name, revision, device=device, model_prompts=model_prompts, **kwargs
49
+ )
41
50
 
42
51
  def to(self, device: torch.device) -> None:
43
52
  self.model.to(device)
@@ -193,7 +202,7 @@ NOMIC_CITATION = """
193
202
  """
194
203
 
195
204
  nomic_embed_v1_5 = ModelMeta(
196
- loader=NomicWrapper, # type: ignore
205
+ loader=NomicWrapper,
197
206
  loader_kwargs=dict(
198
207
  trust_remote_code=True,
199
208
  model_prompts=model_prompts,
@@ -206,13 +215,20 @@ nomic_embed_v1_5 = ModelMeta(
206
215
  release_date="2024-02-10", # first commit
207
216
  citation=NOMIC_CITATION,
208
217
  n_parameters=137_000_000,
218
+ n_embedding_parameters=None,
209
219
  memory_usage_mb=522,
210
220
  max_tokens=8192,
211
221
  embed_dim=768,
212
222
  license="apache-2.0",
213
223
  reference="https://huggingface.co/nomic-ai/nomic-embed-text-v1.5",
214
224
  similarity_fn_name=ScoringFunction.COSINE,
215
- framework=["Sentence Transformers", "PyTorch"],
225
+ framework=[
226
+ "Sentence Transformers",
227
+ "PyTorch",
228
+ "ONNX",
229
+ "safetensors",
230
+ "Transformers",
231
+ ],
216
232
  use_instructions=True,
217
233
  adapted_from=None,
218
234
  superseded_by=None,
@@ -222,7 +238,7 @@ nomic_embed_v1_5 = ModelMeta(
222
238
  )
223
239
 
224
240
  nomic_embed_v1 = ModelMeta(
225
- loader=NomicWrapper, # type: ignore
241
+ loader=NomicWrapper,
226
242
  loader_kwargs=dict(
227
243
  trust_remote_code=True,
228
244
  model_prompts=model_prompts,
@@ -234,13 +250,20 @@ nomic_embed_v1 = ModelMeta(
234
250
  revision="0759316f275aa0cb93a5b830973843ca66babcf5",
235
251
  release_date="2024-01-31", # first commit
236
252
  n_parameters=None,
253
+ n_embedding_parameters=None,
237
254
  memory_usage_mb=522,
238
255
  max_tokens=8192,
239
256
  embed_dim=768,
240
257
  license="apache-2.0",
241
258
  reference="https://huggingface.co/nomic-ai/nomic-embed-text-v1",
242
259
  similarity_fn_name=ScoringFunction.COSINE,
243
- framework=["Sentence Transformers", "PyTorch"],
260
+ framework=[
261
+ "Sentence Transformers",
262
+ "PyTorch",
263
+ "ONNX",
264
+ "safetensors",
265
+ "Transformers",
266
+ ],
244
267
  use_instructions=True,
245
268
  citation=NOMIC_CITATION,
246
269
  adapted_from=None,
@@ -251,7 +274,7 @@ nomic_embed_v1 = ModelMeta(
251
274
  )
252
275
 
253
276
  nomic_embed_v1_ablated = ModelMeta(
254
- loader=NomicWrapper, # type: ignore
277
+ loader=NomicWrapper,
255
278
  loader_kwargs=dict(
256
279
  trust_remote_code=True,
257
280
  model_prompts=model_prompts,
@@ -263,13 +286,14 @@ nomic_embed_v1_ablated = ModelMeta(
263
286
  revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f",
264
287
  release_date="2024-01-15", # first commit
265
288
  n_parameters=None,
289
+ n_embedding_parameters=None,
266
290
  memory_usage_mb=None,
267
291
  max_tokens=8192,
268
292
  embed_dim=768,
269
293
  license="apache-2.0",
270
294
  reference="https://huggingface.co/nomic-ai/nomic-embed-text-v1-ablated",
271
295
  similarity_fn_name=ScoringFunction.COSINE,
272
- framework=["Sentence Transformers", "PyTorch"],
296
+ framework=["Sentence Transformers", "PyTorch", "ONNX"],
273
297
  use_instructions=True,
274
298
  adapted_from=None,
275
299
  superseded_by=None,
@@ -279,7 +303,7 @@ nomic_embed_v1_ablated = ModelMeta(
279
303
  )
280
304
 
281
305
  nomic_embed_v1_unsupervised = ModelMeta(
282
- loader=NomicWrapper, # type: ignore
306
+ loader=NomicWrapper,
283
307
  loader_kwargs=dict(
284
308
  trust_remote_code=True,
285
309
  model_prompts=model_prompts,
@@ -291,13 +315,14 @@ nomic_embed_v1_unsupervised = ModelMeta(
291
315
  revision="b53d557b15ae63852847c222d336c1609eced93c",
292
316
  release_date="2024-01-15", # first commit
293
317
  n_parameters=None,
318
+ n_embedding_parameters=None,
294
319
  memory_usage_mb=None,
295
320
  max_tokens=8192,
296
321
  embed_dim=768,
297
322
  license="apache-2.0",
298
323
  reference="https://huggingface.co/nomic-ai/nomic-embed-text-v1-unsupervised",
299
324
  similarity_fn_name=ScoringFunction.COSINE,
300
- framework=["Sentence Transformers", "PyTorch"],
325
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "Transformers"],
301
326
  use_instructions=True,
302
327
  adapted_from=None,
303
328
  superseded_by=None,
@@ -319,13 +344,14 @@ nomic_modern_bert_embed = ModelMeta(
319
344
  revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12",
320
345
  release_date="2024-12-29",
321
346
  n_parameters=149_000_000,
347
+ n_embedding_parameters=None,
322
348
  memory_usage_mb=568,
323
349
  max_tokens=8192,
324
350
  embed_dim=768,
325
351
  license="apache-2.0",
326
352
  reference="https://huggingface.co/nomic-ai/modernbert-embed-base",
327
353
  similarity_fn_name=ScoringFunction.COSINE,
328
- framework=["Sentence Transformers", "PyTorch"],
354
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
329
355
  use_instructions=True,
330
356
  adapted_from="answerdotai/ModernBERT-base",
331
357
  public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml",
@@ -334,7 +360,7 @@ nomic_modern_bert_embed = ModelMeta(
334
360
  training_datasets=nomic_training_data,
335
361
  public_training_data=None,
336
362
  citation="""@misc{nussbaum2024nomic,
337
- title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
363
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
338
364
  author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
339
365
  year={2024},
340
366
  eprint={2402.01613},
@@ -446,7 +472,7 @@ m_languages = [
446
472
  ]
447
473
 
448
474
  nomic_embed_text_v2_moe = ModelMeta(
449
- loader=NomicWrapper, # type: ignore
475
+ loader=NomicWrapper,
450
476
  loader_kwargs=dict(
451
477
  trust_remote_code=True,
452
478
  model_prompts=model_prompts,
@@ -458,13 +484,15 @@ nomic_embed_text_v2_moe = ModelMeta(
458
484
  revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
459
485
  release_date="2025-02-07",
460
486
  n_parameters=475292928,
487
+ n_embedding_parameters=192036864,
488
+ n_active_parameters_override=141628032,
461
489
  memory_usage_mb=1813,
462
490
  max_tokens=512,
463
491
  embed_dim=768,
464
492
  license="apache-2.0",
465
493
  reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
466
494
  similarity_fn_name=ScoringFunction.COSINE,
467
- framework=["Sentence Transformers", "PyTorch"],
495
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
468
496
  use_instructions=True,
469
497
  adapted_from="nomic-ai/nomic-xlm-2048",
470
498
  public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
@@ -472,12 +500,12 @@ nomic_embed_text_v2_moe = ModelMeta(
472
500
  training_datasets=None, # did not look into this further
473
501
  superseded_by=None,
474
502
  citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
475
- title={Training Sparse Mixture Of Experts Text Embedding Models},
503
+ title={Training Sparse Mixture Of Experts Text Embedding Models},
476
504
  author={Zach Nussbaum and Brandon Duderstadt},
477
505
  year={2025},
478
506
  eprint={2502.07972},
479
507
  archivePrefix={arXiv},
480
508
  primaryClass={cs.CL},
481
- url={https://arxiv.org/abs/2502.07972},
509
+ url={https://arxiv.org/abs/2502.07972},
482
510
  }""",
483
511
  )
@@ -4,17 +4,18 @@ from typing import TYPE_CHECKING, Any
4
4
 
5
5
  import torch
6
6
  import torch.nn.functional as F
7
- from torch.utils.data import DataLoader
8
7
  from tqdm.auto import tqdm
9
8
 
10
9
  from mteb._requires_package import requires_package
11
- from mteb.abstasks.task_metadata import TaskMetadata
12
10
  from mteb.models.abs_encoder import AbsEncoder
13
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
14
- from mteb.types import Array, BatchedInput, PromptType
15
12
 
16
13
  if TYPE_CHECKING:
17
14
  from PIL import Image
15
+ from torch.utils.data import DataLoader
16
+
17
+ from mteb.abstasks.task_metadata import TaskMetadata
18
+ from mteb.types import Array, BatchedInput, PromptType
18
19
 
19
20
  NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
20
21
  title={Nomic Embed Vision: Expanding the Latent Space},
@@ -174,6 +175,7 @@ nomic_embed_vision_v1_5 = ModelMeta(
174
175
  release_date="2024-06-08",
175
176
  modalities=["image", "text"],
176
177
  n_parameters=92_900_000,
178
+ n_embedding_parameters=None,
177
179
  memory_usage_mb=355,
178
180
  max_tokens=2048,
179
181
  embed_dim=768,
@@ -181,7 +183,7 @@ nomic_embed_vision_v1_5 = ModelMeta(
181
183
  open_weights=True,
182
184
  public_training_code="https://github.com/nomic-ai/contrastors",
183
185
  public_training_data=None,
184
- framework=["PyTorch"],
186
+ framework=["PyTorch", "Transformers", "ONNX", "safetensors"],
185
187
  reference="https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5",
186
188
  similarity_fn_name=ScoringFunction.COSINE,
187
189
  use_instructions=True,
@@ -1,16 +1,18 @@
1
+ from __future__ import annotations
2
+
1
3
  from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
6
+ from packaging.version import Version
4
7
  from torch.utils.data import DataLoader
8
+ from transformers import __version__ as transformers_version
5
9
 
6
- from mteb.abstasks.task_metadata import TaskMetadata
7
10
  from mteb.models.abs_encoder import AbsEncoder
8
11
  from mteb.models.model_meta import ModelMeta
9
- from mteb.types import Array, BatchedInput, PromptType
10
12
 
11
13
  if TYPE_CHECKING:
12
- pass
13
-
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.types import Array, BatchedInput, PromptType
14
16
 
15
17
  LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
16
18
  title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
@@ -34,6 +36,14 @@ class LlamaNemoretrieverColembed(AbsEncoder):
34
36
  attn_implementation="flash_attention_2",
35
37
  **kwargs,
36
38
  ):
39
+ required_transformers_version = "4.49.0"
40
+
41
+ if Version(transformers_version) != Version(required_transformers_version):
42
+ raise RuntimeError(
43
+ f"transformers version {transformers_version} is not match with required "
44
+ f"install version {required_transformers_version} to run `nvidia/llama-nemoretriever-colembed`"
45
+ )
46
+
37
47
  from transformers import AutoModel
38
48
 
39
49
  self.model = AutoModel.from_pretrained(
@@ -148,10 +158,11 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
148
158
  name="nvidia/llama-nemoretriever-colembed-1b-v1",
149
159
  model_type=["late-interaction"],
150
160
  languages=["eng-Latn"],
151
- revision="1f0fdea7f5b19532a750be109b19072d719b8177",
161
+ revision="6eade800103413033f260bb55b49fe039fd28a6e",
152
162
  release_date="2025-06-27",
153
163
  modalities=["image", "text"],
154
164
  n_parameters=2_418_000_000,
165
+ n_embedding_parameters=None,
155
166
  memory_usage_mb=4610,
156
167
  max_tokens=8192,
157
168
  embed_dim=2048,
@@ -159,7 +170,7 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
159
170
  open_weights=True,
160
171
  public_training_code="Proprietary Code",
161
172
  public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
162
- framework=["PyTorch"],
173
+ framework=["PyTorch", "Transformers", "safetensors"],
163
174
  reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
164
175
  similarity_fn_name="MaxSim",
165
176
  use_instructions=True,
@@ -175,10 +186,11 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
175
186
  name="nvidia/llama-nemoretriever-colembed-3b-v1",
176
187
  model_type=["late-interaction"],
177
188
  languages=["eng-Latn"],
178
- revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c",
189
+ revision="4194bdd2cd2871f220ddba6273ce173ef1217a1e",
179
190
  release_date="2025-06-27",
180
191
  modalities=["image", "text"],
181
192
  n_parameters=4_407_000_000,
193
+ n_embedding_parameters=None,
182
194
  memory_usage_mb=8403,
183
195
  max_tokens=8192,
184
196
  embed_dim=3072,
@@ -186,7 +198,7 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
186
198
  open_weights=True,
187
199
  public_training_code="Proprietary Code",
188
200
  public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
189
- framework=["PyTorch"],
201
+ framework=["PyTorch", "Transformers", "safetensors"],
190
202
  reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
191
203
  similarity_fn_name="MaxSim",
192
204
  use_instructions=True,