mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (458):
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -140,6 +140,7 @@ arctic_v2_training_datasets = {
140
140
  arctic_embed_xs = ModelMeta(
141
141
  loader=sentence_transformers_loader,
142
142
  name="Snowflake/snowflake-arctic-embed-xs",
143
+ model_type=["dense"],
143
144
  revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e",
144
145
  release_date="2024-07-08", # initial commit of hf model.
145
146
  languages=["eng-Latn"],
@@ -165,6 +166,7 @@ arctic_embed_xs = ModelMeta(
165
166
  arctic_embed_s = ModelMeta(
166
167
  loader=sentence_transformers_loader,
167
168
  name="Snowflake/snowflake-arctic-embed-s",
169
+ model_type=["dense"],
168
170
  revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f",
169
171
  release_date="2024-04-12", # initial commit of hf model.
170
172
  languages=["eng-Latn"],
@@ -190,6 +192,7 @@ arctic_embed_s = ModelMeta(
190
192
  arctic_embed_m = ModelMeta(
191
193
  loader=sentence_transformers_loader,
192
194
  name="Snowflake/snowflake-arctic-embed-m",
195
+ model_type=["dense"],
193
196
  revision="cc17beacbac32366782584c8752220405a0f3f40",
194
197
  release_date="2024-04-12", # initial commit of hf model.
195
198
  languages=["eng-Latn"],
@@ -215,6 +218,7 @@ arctic_embed_m_long = ModelMeta(
215
218
  loader=sentence_transformers_loader,
216
219
  loader_kwargs={"trust_remote_code": True},
217
220
  name="Snowflake/snowflake-arctic-embed-m-long",
221
+ model_type=["dense"],
218
222
  revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1",
219
223
  release_date="2024-04-12", # initial commit of hf model.
220
224
  languages=["eng-Latn"],
@@ -239,6 +243,7 @@ arctic_embed_m_long = ModelMeta(
239
243
  arctic_embed_l = ModelMeta(
240
244
  loader=sentence_transformers_loader,
241
245
  name="Snowflake/snowflake-arctic-embed-l",
246
+ model_type=["dense"],
242
247
  revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c",
243
248
  release_date="2024-04-12", # initial commit of hf model.
244
249
  languages=["eng-Latn"],
@@ -268,6 +273,7 @@ arctic_embed_m_v1_5 = ModelMeta(
268
273
  },
269
274
  ),
270
275
  name="Snowflake/snowflake-arctic-embed-m-v1.5",
276
+ model_type=["dense"],
271
277
  revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
272
278
  release_date="2024-07-08", # initial commit of hf model.
273
279
  languages=["eng-Latn"],
@@ -293,6 +299,7 @@ arctic_embed_m_v2_0 = ModelMeta(
293
299
  loader=sentence_transformers_loader,
294
300
  loader_kwargs={"trust_remote_code": True},
295
301
  name="Snowflake/snowflake-arctic-embed-m-v2.0",
302
+ model_type=["dense"],
296
303
  revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc",
297
304
  release_date="2024-12-04", # initial commit of hf model.
298
305
  languages=LANGUAGES_V2_0,
@@ -317,6 +324,7 @@ arctic_embed_m_v2_0 = ModelMeta(
317
324
  arctic_embed_l_v2_0 = ModelMeta(
318
325
  loader=sentence_transformers_loader,
319
326
  name="Snowflake/snowflake-arctic-embed-l-v2.0",
327
+ model_type=["dense"],
320
328
  revision="edc2df7b6c25794b340229ca082e7c78782e6374",
321
329
  release_date="2024-12-04", # initial commit of hf model.
322
330
  languages=LANGUAGES_V2_0,
@@ -10,6 +10,7 @@ b1ade_training_data = {
10
10
  b1ade_embed = ModelMeta(
11
11
  loader=sentence_transformers_loader,
12
12
  name="w601sxs/b1ade-embed",
13
+ model_type=["dense"],
13
14
  languages=["eng-Latn"],
14
15
  revision="3bdac13927fdc888b903db93b2ffdbd90b295a69",
15
16
  open_weights=True,
@@ -155,6 +155,7 @@ class BedrockModel(AbsEncoder):
155
155
 
156
156
  amazon_titan_embed_text_v1 = ModelMeta(
157
157
  name="bedrock/amazon-titan-embed-text-v1",
158
+ model_type=["dense"],
158
159
  revision="1",
159
160
  release_date="2023-09-27",
160
161
  languages=None, # not specified
@@ -181,6 +182,7 @@ amazon_titan_embed_text_v1 = ModelMeta(
181
182
 
182
183
  amazon_titan_embed_text_v2 = ModelMeta(
183
184
  name="bedrock/amazon-titan-embed-text-v2",
185
+ model_type=["dense"],
184
186
  revision="1",
185
187
  release_date="2024-04-30",
186
188
  languages=None, # not specified
@@ -216,6 +218,7 @@ cohere_embed_english_v3 = ModelMeta(
216
218
  model_prompts=cohere_model_prompts,
217
219
  ),
218
220
  name="bedrock/cohere-embed-english-v3",
221
+ model_type=["dense"],
219
222
  languages=["eng-Latn"],
220
223
  open_weights=False,
221
224
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -243,6 +246,7 @@ cohere_embed_multilingual_v3 = ModelMeta(
243
246
  model_prompts=cohere_model_prompts,
244
247
  ),
245
248
  name="bedrock/cohere-embed-multilingual-v3",
249
+ model_type=["dense"],
246
250
  languages=cohere_supported_languages,
247
251
  open_weights=False,
248
252
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -319,6 +319,7 @@ bge_small_en_v1_5 = ModelMeta(
319
319
  model_prompts=model_prompts,
320
320
  ),
321
321
  name="BAAI/bge-small-en-v1.5",
322
+ model_type=["dense"],
322
323
  languages=["eng-Latn"],
323
324
  open_weights=True,
324
325
  revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a",
@@ -344,6 +345,7 @@ bge_base_en_v1_5 = ModelMeta(
344
345
  model_prompts=model_prompts,
345
346
  ),
346
347
  name="BAAI/bge-base-en-v1.5",
348
+ model_type=["dense"],
347
349
  languages=["eng-Latn"],
348
350
  open_weights=True,
349
351
  revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",
@@ -369,6 +371,7 @@ bge_large_en_v1_5 = ModelMeta(
369
371
  model_prompts=model_prompts,
370
372
  ),
371
373
  name="BAAI/bge-large-en-v1.5",
374
+ model_type=["dense"],
372
375
  languages=["eng-Latn"],
373
376
  open_weights=True,
374
377
  revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09",
@@ -394,6 +397,7 @@ bge_small_zh = ModelMeta(
394
397
  model_prompts=model_prompts_zh,
395
398
  ),
396
399
  name="BAAI/bge-small-zh",
400
+ model_type=["dense"],
397
401
  languages=["zho-Hans"],
398
402
  open_weights=True,
399
403
  revision="1d2363c5de6ce9ba9c890c8e23a4c72dce540ca8",
@@ -411,6 +415,7 @@ bge_small_zh = ModelMeta(
411
415
  public_training_data=None,
412
416
  training_datasets=bge_chinese_training_data,
413
417
  superseded_by="BAAI/bge-small-zh-v1.5",
418
+ citation=BGE_15_CITATION,
414
419
  )
415
420
 
416
421
  bge_base_zh = ModelMeta(
@@ -419,6 +424,7 @@ bge_base_zh = ModelMeta(
419
424
  model_prompts=model_prompts_zh,
420
425
  ),
421
426
  name="BAAI/bge-base-zh",
427
+ model_type=["dense"],
422
428
  languages=["zho-Hans"],
423
429
  open_weights=True,
424
430
  revision="0e5f83d4895db7955e4cb9ed37ab73f7ded339b6",
@@ -436,6 +442,7 @@ bge_base_zh = ModelMeta(
436
442
  public_training_data=None,
437
443
  training_datasets=bge_chinese_training_data,
438
444
  superseded_by="BAAI/bge-base-zh-v1.5",
445
+ citation=BGE_15_CITATION,
439
446
  )
440
447
 
441
448
  bge_large_zh = ModelMeta(
@@ -444,6 +451,7 @@ bge_large_zh = ModelMeta(
444
451
  model_prompts=model_prompts_zh,
445
452
  ),
446
453
  name="BAAI/bge-large-zh",
454
+ model_type=["dense"],
447
455
  languages=["zho-Hans"],
448
456
  open_weights=True,
449
457
  revision="b5d9f5c027e87b6f0b6fa4b614f8f9cdc45ce0e8",
@@ -461,6 +469,7 @@ bge_large_zh = ModelMeta(
461
469
  public_training_data=None,
462
470
  training_datasets=bge_chinese_training_data,
463
471
  superseded_by="BAAI/bge-large-zh-v1.5",
472
+ citation=BGE_15_CITATION,
464
473
  )
465
474
 
466
475
  bge_small_en = ModelMeta(
@@ -469,6 +478,7 @@ bge_small_en = ModelMeta(
469
478
  model_prompts=model_prompts,
470
479
  ),
471
480
  name="BAAI/bge-small-en",
481
+ model_type=["dense"],
472
482
  languages=["eng-Latn"],
473
483
  open_weights=True,
474
484
  revision="4778d71a06863076696b03fd2777eb118712cad8",
@@ -486,6 +496,7 @@ bge_small_en = ModelMeta(
486
496
  public_training_data="https://data.baai.ac.cn/details/BAAI-MTP",
487
497
  training_datasets=bge_training_data,
488
498
  superseded_by="BAAI/bge-small-en-v1.5",
499
+ citation=BGE_15_CITATION,
489
500
  )
490
501
 
491
502
  bge_base_en = ModelMeta(
@@ -494,6 +505,7 @@ bge_base_en = ModelMeta(
494
505
  model_prompts=model_prompts,
495
506
  ),
496
507
  name="BAAI/bge-base-en",
508
+ model_type=["dense"],
497
509
  languages=["eng-Latn"],
498
510
  open_weights=True,
499
511
  revision="b737bf5dcc6ee8bdc530531266b4804a5d77b5d8",
@@ -511,6 +523,7 @@ bge_base_en = ModelMeta(
511
523
  public_training_data="https://data.baai.ac.cn/details/BAAI-MTP",
512
524
  training_datasets=bge_training_data,
513
525
  superseded_by="BAAI/bge-base-en-v1.5",
526
+ citation=BGE_15_CITATION,
514
527
  )
515
528
 
516
529
  bge_large_en = ModelMeta(
@@ -519,6 +532,7 @@ bge_large_en = ModelMeta(
519
532
  model_prompts=model_prompts,
520
533
  ),
521
534
  name="BAAI/bge-large-en",
535
+ model_type=["dense"],
522
536
  languages=["eng-Latn"],
523
537
  open_weights=True,
524
538
  revision="abe7d9d814b775ca171121fb03f394dc42974275",
@@ -536,6 +550,7 @@ bge_large_en = ModelMeta(
536
550
  public_training_data="https://data.baai.ac.cn/details/BAAI-MTP",
537
551
  training_datasets=bge_training_data,
538
552
  superseded_by="BAAI/bge-large-en-v1.5",
553
+ citation=BGE_15_CITATION,
539
554
  )
540
555
 
541
556
 
@@ -545,6 +560,7 @@ bge_small_zh_v1_5 = ModelMeta(
545
560
  model_prompts=model_prompts_zh,
546
561
  ),
547
562
  name="BAAI/bge-small-zh-v1.5",
563
+ model_type=["dense"],
548
564
  languages=["zho-Hans"],
549
565
  open_weights=True,
550
566
  revision="7999e1d3359715c523056ef9478215996d62a620",
@@ -561,6 +577,7 @@ bge_small_zh_v1_5 = ModelMeta(
561
577
  public_training_code=None,
562
578
  public_training_data=None,
563
579
  training_datasets=bge_chinese_training_data,
580
+ citation=BGE_15_CITATION,
564
581
  )
565
582
 
566
583
  bge_base_zh_v1_5 = ModelMeta(
@@ -569,6 +586,7 @@ bge_base_zh_v1_5 = ModelMeta(
569
586
  model_prompts=model_prompts_zh,
570
587
  ),
571
588
  name="BAAI/bge-base-zh-v1.5",
589
+ model_type=["dense"],
572
590
  languages=["zho-Hans"],
573
591
  open_weights=True,
574
592
  revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65",
@@ -585,6 +603,7 @@ bge_base_zh_v1_5 = ModelMeta(
585
603
  public_training_code=None,
586
604
  public_training_data=None,
587
605
  training_datasets=bge_chinese_training_data,
606
+ citation=BGE_15_CITATION,
588
607
  )
589
608
 
590
609
  bge_large_zh_v1_5 = ModelMeta(
@@ -593,6 +612,7 @@ bge_large_zh_v1_5 = ModelMeta(
593
612
  model_prompts=model_prompts_zh,
594
613
  ),
595
614
  name="BAAI/bge-large-zh-v1.5",
615
+ model_type=["dense"],
596
616
  languages=["zho-Hans"],
597
617
  open_weights=True,
598
618
  revision="79e7739b6ab944e86d6171e44d24c997fc1e0116",
@@ -609,11 +629,13 @@ bge_large_zh_v1_5 = ModelMeta(
609
629
  public_training_code=None,
610
630
  public_training_data=None,
611
631
  training_datasets=bge_chinese_training_data,
632
+ citation=BGE_15_CITATION,
612
633
  )
613
634
 
614
635
  bge_m3 = ModelMeta(
615
636
  loader=sentence_transformers_loader,
616
637
  name="BAAI/bge-m3",
638
+ model_type=["dense"],
617
639
  languages=bgem3_languages,
618
640
  open_weights=True,
619
641
  revision="5617a9f61b028005a4858fdac845db406aefb181",
@@ -630,6 +652,14 @@ bge_m3 = ModelMeta(
630
652
  public_training_code=None,
631
653
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
632
654
  training_datasets=bge_m3_training_data,
655
+ citation="""@misc{bge-m3,
656
+ title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
657
+ author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
658
+ year={2024},
659
+ eprint={2402.03216},
660
+ archivePrefix={arXiv},
661
+ primaryClass={cs.CL}
662
+ }""",
633
663
  )
634
664
 
635
665
  # Contents of cfli/bge-full-data
@@ -692,6 +722,7 @@ bge_full_data = {
692
722
  bge_multilingual_gemma2 = ModelMeta(
693
723
  loader=sentence_transformers_loader,
694
724
  name="BAAI/bge-multilingual-gemma2",
725
+ model_type=["dense"],
695
726
  languages=[
696
727
  "eng-Latn",
697
728
  "zho-Hans",
@@ -722,11 +753,30 @@ bge_multilingual_gemma2 = ModelMeta(
722
753
  }
723
754
  | bge_full_data
724
755
  | bge_m3_training_data,
756
+ citation="""@misc{bge-m3,
757
+ title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
758
+ author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
759
+ year={2024},
760
+ eprint={2402.03216},
761
+ archivePrefix={arXiv},
762
+ primaryClass={cs.CL}
763
+ }
764
+
765
+
766
+ @misc{bge_embedding,
767
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
768
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
769
+ year={2023},
770
+ eprint={2309.07597},
771
+ archivePrefix={arXiv},
772
+ primaryClass={cs.CL}
773
+ }""",
725
774
  )
726
775
 
727
776
  bge_en_icl = ModelMeta(
728
777
  loader=sentence_transformers_loader,
729
778
  name="BAAI/bge-en-icl",
779
+ model_type=["dense"],
730
780
  languages=[
731
781
  "eng-Latn",
732
782
  ],
@@ -762,6 +812,7 @@ bge_en_icl = ModelMeta(
762
812
  bge_m3_unsupervised = ModelMeta(
763
813
  loader=sentence_transformers_loader,
764
814
  name="BAAI/bge-m3-unsupervised",
815
+ model_type=["dense"],
765
816
  languages=bgem3_languages,
766
817
  open_weights=True,
767
818
  revision="46f03bc86361cf88102b0b517b36c8259f2946b1",
@@ -778,10 +829,19 @@ bge_m3_unsupervised = ModelMeta(
778
829
  public_training_code="https://github.com/FlagOpen/FlagEmbedding",
779
830
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
780
831
  training_datasets=bge_m3_training_data,
832
+ citation="""@misc{bge-m3,
833
+ title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
834
+ author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
835
+ year={2024},
836
+ eprint={2402.03216},
837
+ archivePrefix={arXiv},
838
+ primaryClass={cs.CL}
839
+ }""",
781
840
  )
782
841
 
783
842
  manu__bge_m3_custom_fr = ModelMeta(
784
843
  name="manu/bge-m3-custom-fr",
844
+ model_type=["dense"],
785
845
  revision="ed3ef88678ba83ddf4c0fab71a93cb90d89a9078",
786
846
  release_date="2024-04-11",
787
847
  languages=None,
@@ -0,0 +1,35 @@
1
+ from mteb.models import ModelMeta, sentence_transformers_loader
2
+
3
+ bica_base = ModelMeta(
4
+ name="bisectgroup/BiCA-base",
5
+ model_type=["dense"],
6
+ loader=sentence_transformers_loader,
7
+ languages=["eng-Latn"],
8
+ open_weights=True,
9
+ revision="31237a836e5ae908c308a256573e5f0986498574",
10
+ release_date="2025-11-14",
11
+ n_parameters=110_000_000,
12
+ memory_usage_mb=418,
13
+ embed_dim=768,
14
+ license="mit",
15
+ max_tokens=512,
16
+ reference="https://huggingface.co/bisectgroup/BiCA-base",
17
+ similarity_fn_name="cosine",
18
+ framework=["Sentence Transformers", "PyTorch"],
19
+ use_instructions=False,
20
+ public_training_code="https://github.com/NiravBhattLab/BiCA",
21
+ public_training_data="https://huggingface.co/datasets/bisectgroup/hard-negatives-traversal",
22
+ adapted_from="thenlper/gte-base",
23
+ citation="""
24
+ @misc{sinha2025bicaeffectivebiomedicaldense,
25
+ title={BiCA: Effective Biomedical Dense Retrieval with Citation-Aware Hard Negatives},
26
+ author={Aarush Sinha and Pavan Kumar S and Roshan Balaji and Nirav Pravinbhai Bhatt},
27
+ year={2025},
28
+ eprint={2511.08029},
29
+ archivePrefix={arXiv},
30
+ primaryClass={cs.IR},
31
+ url={https://arxiv.org/abs/2511.08029},
32
+ }
33
+ """,
34
+ training_datasets=set(),
35
+ )
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ BLIP2_CITATION = """@inproceedings{li2023blip2,
14
+ title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
15
+ author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
16
+ year={2023},
17
+ booktitle={ICML},
18
+ }"""
19
+
13
20
 
14
21
  def blip2_loader(model_name, **kwargs):
15
22
  requires_package(
@@ -159,6 +166,7 @@ blip2_training_datasets = set(
159
166
  blip2_opt_2_7b = ModelMeta(
160
167
  loader=blip2_loader,
161
168
  name="Salesforce/blip2-opt-2.7b",
169
+ model_type=["dense"],
162
170
  languages=["eng-Latn"],
163
171
  revision="51572668da0eb669e01a189dc22abe6088589a24",
164
172
  release_date="2024-03-22",
@@ -176,11 +184,13 @@ blip2_opt_2_7b = ModelMeta(
176
184
  similarity_fn_name=ScoringFunction.COSINE,
177
185
  use_instructions=False,
178
186
  training_datasets=blip2_training_datasets,
187
+ citation=BLIP2_CITATION,
179
188
  )
180
189
 
181
190
  blip2_opt_6_7b_coco = ModelMeta(
182
191
  loader=blip2_loader,
183
192
  name="Salesforce/blip2-opt-6.7b-coco",
193
+ model_type=["dense"],
184
194
  languages=["eng-Latn"],
185
195
  revision="0d580de59320a25a4d2c386387bcef310d5f286e",
186
196
  release_date="2024-03-31",
@@ -198,4 +208,5 @@ blip2_opt_6_7b_coco = ModelMeta(
198
208
  similarity_fn_name=ScoringFunction.COSINE,
199
209
  use_instructions=False,
200
210
  training_datasets=blip2_training_datasets,
211
+ citation=BLIP2_CITATION,
201
212
  )
@@ -10,6 +10,17 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
14
+ doi = {10.48550/ARXIV.2201.12086},
15
+ url = {https://arxiv.org/abs/2201.12086},
16
+ author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
17
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
18
+ title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
19
+ publisher = {arXiv},
20
+ year = {2022},
21
+ copyright = {Creative Commons Attribution 4.0 International}
22
+ }"""
23
+
13
24
 
14
25
  class BLIPModel(AbsEncoder):
15
26
  def __init__(
@@ -119,6 +130,7 @@ class BLIPModel(AbsEncoder):
119
130
  blip_image_captioning_large = ModelMeta(
120
131
  loader=BLIPModel, # type: ignore
121
132
  name="Salesforce/blip-image-captioning-large",
133
+ model_type=["dense"],
122
134
  languages=["eng-Latn"],
123
135
  revision="2227ac38c9f16105cb0412e7cab4759978a8fd90",
124
136
  release_date="2023-12-07",
@@ -140,11 +152,13 @@ blip_image_captioning_large = ModelMeta(
140
152
  # CC3M+CC12M+SBU
141
153
  # LAION115M
142
154
  ),
155
+ citation=BLIP_CITATION,
143
156
  )
144
157
 
145
158
  blip_image_captioning_base = ModelMeta(
146
159
  loader=BLIPModel, # type: ignore
147
160
  name="Salesforce/blip-image-captioning-base",
161
+ model_type=["dense"],
148
162
  languages=["eng-Latn"],
149
163
  revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84",
150
164
  release_date="2023-08-01",
@@ -166,12 +180,14 @@ blip_image_captioning_base = ModelMeta(
166
180
  # CC3M+CC12M+SBU
167
181
  # LAION115M
168
182
  ),
183
+ citation=BLIP_CITATION,
169
184
  )
170
185
 
171
186
 
172
187
  blip_vqa_base = ModelMeta(
173
188
  loader=BLIPModel, # type: ignore
174
189
  name="Salesforce/blip-vqa-base",
190
+ model_type=["dense"],
175
191
  languages=["eng-Latn"],
176
192
  revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64",
177
193
  release_date="2023-12-07",
@@ -192,11 +208,13 @@ blip_vqa_base = ModelMeta(
192
208
  # CC3M+CC12M+SBU
193
209
  # LAION115M
194
210
  ),
211
+ citation=BLIP_CITATION,
195
212
  )
196
213
 
197
214
  blip_vqa_capfilt_large = ModelMeta(
198
215
  loader=BLIPModel, # type: ignore
199
216
  name="Salesforce/blip-vqa-capfilt-large",
217
+ model_type=["dense"],
200
218
  languages=["eng-Latn"],
201
219
  revision="e53f95265aeab69013fabb5380500ab984adbbb4",
202
220
  release_date="2023-01-22",
@@ -217,11 +235,13 @@ blip_vqa_capfilt_large = ModelMeta(
217
235
  # CC3M+CC12M+SBU
218
236
  # LAION115M
219
237
  ),
238
+ citation=BLIP_CITATION,
220
239
  )
221
240
 
222
241
  blip_itm_base_coco = ModelMeta(
223
242
  loader=BLIPModel, # type: ignore
224
243
  name="Salesforce/blip-itm-base-coco",
244
+ model_type=["dense"],
225
245
  languages=["eng-Latn"],
226
246
  revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f",
227
247
  release_date="2023-08-01",
@@ -242,11 +262,13 @@ blip_itm_base_coco = ModelMeta(
242
262
  # CC3M+CC12M+SBU
243
263
  # LAION115M
244
264
  ),
265
+ citation=BLIP_CITATION,
245
266
  )
246
267
 
247
268
  blip_itm_large_coco = ModelMeta(
248
269
  loader=BLIPModel, # type: ignore
249
270
  name="Salesforce/blip-itm-large-coco",
271
+ model_type=["dense"],
250
272
  languages=["eng-Latn"],
251
273
  revision="fef05cafc05298067cbbca00b125749394a77a6f",
252
274
  release_date="2023-08-01",
@@ -268,11 +290,13 @@ blip_itm_large_coco = ModelMeta(
268
290
  # CC3M+CC12M+SBU
269
291
  # LAION115M
270
292
  ),
293
+ citation=BLIP_CITATION,
271
294
  )
272
295
 
273
296
  blip_itm_base_flickr = ModelMeta(
274
297
  loader=BLIPModel, # type: ignore
275
298
  name="Salesforce/blip-itm-base-flickr",
299
+ model_type=["dense"],
276
300
  languages=["eng-Latn"],
277
301
  revision="1de29e660d91ae1786c1876212ea805a22eab251",
278
302
  release_date="2023-08-01",
@@ -294,11 +318,13 @@ blip_itm_base_flickr = ModelMeta(
294
318
  # LAION115M
295
319
  # Flickr30k
296
320
  ),
321
+ citation=BLIP_CITATION,
297
322
  )
298
323
 
299
324
  blip_itm_large_flickr = ModelMeta(
300
325
  loader=BLIPModel, # type: ignore
301
326
  name="Salesforce/blip-itm-large-flickr",
327
+ model_type=["dense"],
302
328
  languages=["eng-Latn"],
303
329
  revision="bda12e6506758f54261b5ab174b2c55a3ba143fb",
304
330
  release_date="2023-08-01",
@@ -319,4 +345,5 @@ blip_itm_large_flickr = ModelMeta(
319
345
  # CC3M+CC12M+SBU
320
346
  # LAION115M
321
347
  ),
348
+ citation=BLIP_CITATION,
322
349
  )
@@ -121,6 +121,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
121
121
  bm25_s = ModelMeta(
122
122
  loader=bm25_loader,
123
123
  name="bm25s",
124
+ model_type=["dense"],
124
125
  languages=["eng-Latn"],
125
126
  open_weights=True,
126
127
  revision="0_1_10",
@@ -90,6 +90,7 @@ BMRetriever_410M = ModelMeta(
90
90
  apply_instruction_to_passages=True,
91
91
  ),
92
92
  name="BMRetriever/BMRetriever-410M",
93
+ model_type=["dense"],
93
94
  languages=["eng-Latn"],
94
95
  open_weights=True,
95
96
  revision="e3569bfbcfe3a1bc48c142e11a7b0f38e86065a3",
@@ -119,6 +120,7 @@ BMRetriever_1B = ModelMeta(
119
120
  apply_instruction_to_passages=True,
120
121
  ),
121
122
  name="BMRetriever/BMRetriever-1B",
123
+ model_type=["dense"],
122
124
  languages=["eng-Latn"],
123
125
  open_weights=True,
124
126
  revision="1b758c5f4d3af48ef6035cc4088bdbcd7df43ca6",
@@ -148,6 +150,7 @@ BMRetriever_2B = ModelMeta(
148
150
  apply_instruction_to_passages=True,
149
151
  ),
150
152
  name="BMRetriever/BMRetriever-2B",
153
+ model_type=["dense"],
151
154
  languages=["eng-Latn"],
152
155
  open_weights=True,
153
156
  revision="718179afd57926369c347f46eee616db81084941",
@@ -177,6 +180,7 @@ BMRetriever_7B = ModelMeta(
177
180
  apply_instruction_to_passages=True,
178
181
  ),
179
182
  name="BMRetriever/BMRetriever-7B",
183
+ model_type=["dense"],
180
184
  languages=["eng-Latn"],
181
185
  open_weights=True,
182
186
  revision="13e6adb9273c5f254e037987d6b44e9e4b005b9a",
@@ -3,6 +3,13 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
3
3
 
4
4
  from .bge_models import bge_m3_training_data
5
5
 
6
+ CADET_CITATION = """@article{tamber2025conventionalcontrastivelearningfalls,
7
+ title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
8
+ author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
9
+ journal={arXiv:2505.19274},
10
+ year={2025}
11
+ }"""
12
+
6
13
  cadet_training_data = {
7
14
  # we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
8
15
  # However, we do use queries from MSMARCO as examples for synthetic query generation.
@@ -28,6 +35,7 @@ cadet_embed = ModelMeta(
28
35
  },
29
36
  ),
30
37
  name="manveertamber/cadet-embed-base-v1",
38
+ model_type=["dense"],
31
39
  languages=["eng-Latn"],
32
40
  revision="8056d118be37a566f20972a5f35cda815f6bc47e",
33
41
  open_weights=True,
@@ -46,4 +54,5 @@ cadet_embed = ModelMeta(
46
54
  public_training_data="https://github.com/manveertamber/cadet-dense-retrieval",
47
55
  training_datasets=cadet_training_data,
48
56
  adapted_from="intfloat/e5-base-unsupervised",
57
+ citation=CADET_CITATION,
49
58
  )
@@ -24,6 +24,16 @@ if TYPE_CHECKING:
24
24
  )
25
25
  logger = logging.getLogger(__name__)
26
26
 
27
+ CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
28
+ title={Contextual Document Embeddings},
29
+ author={John X. Morris and Alexander M. Rush},
30
+ year={2024},
31
+ eprint={2410.02525},
32
+ archivePrefix={arXiv},
33
+ primaryClass={cs.CL},
34
+ url={https://arxiv.org/abs/2410.02525},
35
+ }"""
36
+
27
37
 
28
38
  class CDEWrapper(SentenceTransformerEncoderWrapper):
29
39
  dataset_embeddings: torch.Tensor | None = None
@@ -199,6 +209,7 @@ cde_small_v1 = ModelMeta(
199
209
  trust_remote_code=True,
200
210
  ),
201
211
  name="jxm/cde-small-v1",
212
+ model_type=["dense"],
202
213
  languages=["eng-Latn"],
203
214
  open_weights=True,
204
215
  revision="e151df18af0d7f1d1c37b074fee58406ececf19f",
@@ -217,6 +228,7 @@ cde_small_v1 = ModelMeta(
217
228
  training_datasets=bge_full_data,
218
229
  public_training_code="https://github.com/jxmorris12/cde",
219
230
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
231
+ citation=CDE_CITATION,
220
232
  )
221
233
 
222
234
  cde_small_v2 = ModelMeta(
@@ -226,6 +238,7 @@ cde_small_v2 = ModelMeta(
226
238
  trust_remote_code=True,
227
239
  ),
228
240
  name="jxm/cde-small-v2",
241
+ model_type=["dense"],
229
242
  languages=["eng-Latn"],
230
243
  open_weights=True,
231
244
  revision="4e1d021a6c3fd7ce8aa0a7204057eee5ae61d390",
@@ -244,4 +257,5 @@ cde_small_v2 = ModelMeta(
244
257
  training_datasets=bge_full_data,
245
258
  public_training_code="https://github.com/jxmorris12/cde",
246
259
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
260
+ citation=CDE_CITATION,
247
261
  )