mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,14 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
14
+ title={Reproducible scaling laws for contrastive language-image learning},
15
+ author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
16
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
17
+ pages={2818--2829},
18
+ year={2023}
19
+ }"""
20
+
13
21
 
14
22
  def openclip_loader(model_name, **kwargs):
15
23
  requires_package(
@@ -114,6 +122,7 @@ def openclip_loader(model_name, **kwargs):
114
122
  CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
115
123
  loader=openclip_loader, # type: ignore
116
124
  name="laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K",
125
+ model_type=["dense"],
117
126
  languages=["eng-Latn"],
118
127
  revision="84c9828e63dc9a9351d1fe637c346d4c1c4db341",
119
128
  release_date="2023-04-26",
@@ -133,11 +142,13 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
133
142
  training_datasets=set(
134
143
  # DataComp-1B
135
144
  ),
145
+ citation=OPENCLIP_CITATION,
136
146
  )
137
147
 
138
148
  CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
139
149
  loader=openclip_loader, # type: ignore
140
150
  name="laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K",
151
+ model_type=["dense"],
141
152
  languages=["eng-Latn"],
142
153
  revision="f0e2ffa09cbadab3db6a261ec1ec56407ce42912",
143
154
  release_date="2023-04-26",
@@ -157,11 +168,13 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
157
168
  training_datasets=set(
158
169
  # DataComp-1B
159
170
  ),
171
+ citation=OPENCLIP_CITATION,
160
172
  )
161
173
 
162
174
  CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
163
175
  loader=openclip_loader, # type: ignore
164
176
  name="laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K",
177
+ model_type=["dense"],
165
178
  languages=["eng-Latn"],
166
179
  revision="d110532e8d4ff91c574ee60a342323f28468b287",
167
180
  release_date="2023-04-26",
@@ -181,11 +194,13 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
181
194
  training_datasets=set(
182
195
  # DataComp-1B
183
196
  ),
197
+ citation=OPENCLIP_CITATION,
184
198
  )
185
199
 
186
200
  CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
187
201
  loader=openclip_loader, # type: ignore
188
202
  name="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
203
+ model_type=["dense"],
189
204
  languages=["eng-Latn"],
190
205
  revision="bc7788f151930d91b58474715fdce5524ad9a189",
191
206
  release_date="2023-01-23",
@@ -205,11 +220,13 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
205
220
  training_datasets=set(
206
221
  # 2 Billion sample English subset of LAION-5B
207
222
  ),
223
+ citation=OPENCLIP_CITATION,
208
224
  )
209
225
 
210
226
  CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
211
227
  loader=openclip_loader, # type: ignore
212
228
  name="laion/CLIP-ViT-g-14-laion2B-s34B-b88K",
229
+ model_type=["dense"],
213
230
  languages=["eng-Latn"],
214
231
  revision="15efd0f6ac0c40c0f9da7becca03c974d7012604",
215
232
  release_date="2023-03-06",
@@ -229,11 +246,13 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
229
246
  training_datasets=set(
230
247
  # 2 Billion sample English subset of LAION-5B
231
248
  ),
249
+ citation=OPENCLIP_CITATION,
232
250
  )
233
251
 
234
252
  CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
235
253
  loader=openclip_loader, # type: ignore
236
254
  name="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
255
+ model_type=["dense"],
237
256
  languages=["eng-Latn"],
238
257
  revision="de081ac0a0ca8dc9d1533eed1ae884bb8ae1404b",
239
258
  release_date="2022-09-15",
@@ -253,11 +272,13 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
253
272
  training_datasets=set(
254
273
  # 2 Billion sample English subset of LAION-5B
255
274
  ),
275
+ citation=OPENCLIP_CITATION,
256
276
  )
257
277
 
258
278
  CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
259
279
  loader=openclip_loader, # type: ignore
260
280
  name="laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
281
+ model_type=["dense"],
261
282
  languages=["eng-Latn"],
262
283
  revision="1627032197142fbe2a7cfec626f4ced3ae60d07a",
263
284
  release_date="2022-09-15",
@@ -277,11 +298,13 @@ CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
277
298
  training_datasets=set(
278
299
  # 2 Billion sample English subset of LAION-5B
279
300
  ),
301
+ citation=OPENCLIP_CITATION,
280
302
  )
281
303
 
282
304
  CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
283
305
  loader=openclip_loader,
284
306
  name="laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
307
+ model_type=["dense"],
285
308
  languages=["eng-Latn"],
286
309
  revision="08f73555f1b2fb7c82058aebbd492887a94968ef",
287
310
  release_date="2022-09-15",
@@ -301,4 +324,5 @@ CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
301
324
  training_datasets=set(
302
325
  # 2 Billion sample English subset of LAION-5B
303
326
  ),
327
+ citation=OPENCLIP_CITATION,
304
328
  )
@@ -128,6 +128,7 @@ class SparseEncoderWrapper(AbsEncoder):
128
128
 
129
129
  opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
130
130
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
131
+ model_type=["dense"],
131
132
  languages=["eng-Latn"],
132
133
  open_weights=True,
133
134
  revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6",
@@ -153,6 +154,7 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
153
154
 
154
155
  opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
155
156
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
157
+ model_type=["dense"],
156
158
  languages=["eng-Latn"],
157
159
  open_weights=True,
158
160
  revision="babf71f3c48695e2e53a978208e8aba48335e3c0",
@@ -174,6 +176,7 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
174
176
 
175
177
  opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
176
178
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
179
+ model_type=["dense"],
177
180
  languages=["eng-Latn"],
178
181
  open_weights=True,
179
182
  revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f",
@@ -196,6 +199,7 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
196
199
 
197
200
  opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
198
201
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
202
+ model_type=["dense"],
199
203
  languages=["eng-Latn"],
200
204
  open_weights=True,
201
205
  revision="4af867a426867dfdd744097531046f4289a32fdd",
@@ -217,6 +221,7 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
217
221
 
218
222
  opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
219
223
  name="opensearch-project/opensearch-neural-sparse-encoding-doc-v1",
224
+ model_type=["dense"],
220
225
  languages=["eng-Latn"],
221
226
  open_weights=True,
222
227
  revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d",
@@ -22,12 +22,13 @@ class OPSWrapper(AbsEncoder):
22
22
 
23
23
  ops_moa_conan_embedding = ModelMeta(
24
24
  name="OpenSearch-AI/Ops-MoA-Conan-embedding-v1",
25
+ model_type=["dense"],
25
26
  revision="46dcd58753f3daa920c66f89e47086a534089350",
26
27
  release_date="2025-03-26",
27
28
  languages=["zho-Hans"],
28
29
  loader=OPSWrapper,
29
30
  n_parameters=int(343 * 1e6),
30
- memory_usage_mb=2e3,
31
+ memory_usage_mb=1308,
31
32
  max_tokens=512,
32
33
  embed_dim=1536,
33
34
  license="cc-by-nc-4.0",
@@ -53,12 +54,13 @@ ops_moa_conan_embedding = ModelMeta(
53
54
 
54
55
  ops_moa_yuan_embedding = ModelMeta(
55
56
  name="OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0",
57
+ model_type=["dense"],
56
58
  revision="23712d0766417b0eb88a2513c6e212a58b543268",
57
59
  release_date="2025-03-26",
58
60
  languages=["zho-Hans"],
59
61
  loader=OPSWrapper,
60
62
  n_parameters=int(343 * 1e6),
61
- memory_usage_mb=2e3,
63
+ memory_usage_mb=1242,
62
64
  max_tokens=512,
63
65
  embed_dim=1536,
64
66
  license="cc-by-nc-4.0",
@@ -0,0 +1,39 @@
1
+ from mteb.models.model_meta import (
2
+ ModelMeta,
3
+ ScoringFunction,
4
+ )
5
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
6
+
7
+ PAWAN_EMBD_CITATION = """@misc{medhi2025pawanembd,
8
+ title={PawanEmbd-68M: Distilled Embedding Model},
9
+ author={Medhi, D.},
10
+ year={2025},
11
+ url={https://huggingface.co/dmedhi/PawanEmbd-68M}
12
+ }"""
13
+
14
+ pawan_embd_68m = ModelMeta(
15
+ loader=sentence_transformers_loader,
16
+ name="dmedhi/PawanEmbd-68M",
17
+ model_type=["dense"],
18
+ languages=["eng-Latn"],
19
+ open_weights=True,
20
+ revision="32f295145802bdbd65699ad65fd27d2a5b69a909",
21
+ release_date="2025-12-08",
22
+ n_parameters=68_000_000,
23
+ memory_usage_mb=260,
24
+ embed_dim=768,
25
+ license="apache-2.0",
26
+ max_tokens=512,
27
+ reference="https://huggingface.co/dmedhi/PawanEmbd-68M",
28
+ similarity_fn_name=ScoringFunction.COSINE,
29
+ framework=["Sentence Transformers", "PyTorch"],
30
+ adapted_from="ibm-granite/granite-embedding-278m-multilingual",
31
+ superseded_by=None,
32
+ public_training_code=None,
33
+ public_training_data=None,
34
+ use_instructions=False,
35
+ training_datasets={
36
+ "AllNLI",
37
+ },
38
+ citation=PAWAN_EMBD_CITATION,
39
+ )
@@ -6,6 +6,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
6
6
  piccolo_base_zh = ModelMeta(
7
7
  loader=sentence_transformers_loader,
8
8
  name="sensenova/piccolo-base-zh",
9
+ model_type=["dense"],
9
10
  languages=["zho-Hans"],
10
11
  open_weights=True,
11
12
  revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
@@ -29,6 +30,7 @@ piccolo_base_zh = ModelMeta(
29
30
  piccolo_large_zh_v2 = ModelMeta(
30
31
  loader=sentence_transformers_loader,
31
32
  name="sensenova/piccolo-large-zh-v2",
33
+ model_type=["dense"],
32
34
  languages=["zho-Hans"],
33
35
  open_weights=False, # They "temporarily" removed it in may last year
34
36
  # "Due to certain internal company considerations"
@@ -48,4 +50,10 @@ piccolo_large_zh_v2 = ModelMeta(
48
50
  public_training_code=None,
49
51
  public_training_data=None,
50
52
  training_datasets=None, # They don't say
53
+ citation="""@misc{2405.06932,
54
+ Author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
55
+ Title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
56
+ Year = {2024},
57
+ Eprint = {arXiv:2405.06932},
58
+ }""",
51
59
  )
@@ -75,12 +75,13 @@ promptriever_llama2 = ModelMeta(
75
75
  model_prompts=model_prompts,
76
76
  ),
77
77
  name="samaya-ai/promptriever-llama2-7b-v1",
78
+ model_type=["dense"],
78
79
  languages=["eng-Latn"],
79
80
  open_weights=True,
80
81
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision
81
82
  release_date="2024-09-15",
82
83
  n_parameters=7_000_000_000,
83
- memory_usage_mb=27,
84
+ memory_usage_mb=26703,
84
85
  max_tokens=4096,
85
86
  embed_dim=4096,
86
87
  license="apache-2.0",
@@ -106,6 +107,7 @@ promptriever_llama3 = ModelMeta(
106
107
  model_prompts=model_prompts,
107
108
  ),
108
109
  name="samaya-ai/promptriever-llama3.1-8b-v1",
110
+ model_type=["dense"],
109
111
  languages=["eng-Latn"],
110
112
  open_weights=True,
111
113
  revision="48d6d0fc4e02fb1269b36940650a1b7233035cbb-2ead22cfb1b0e0c519c371c63c2ab90ffc511b8a", # base-peft revision
@@ -115,7 +117,7 @@ promptriever_llama3 = ModelMeta(
115
117
  },
116
118
  release_date="2024-09-15",
117
119
  n_parameters=8_000_000_000,
118
- memory_usage_mb=31,
120
+ memory_usage_mb=30518,
119
121
  max_tokens=8192,
120
122
  embed_dim=4096,
121
123
  license="apache-2.0",
@@ -138,12 +140,13 @@ promptriever_llama3_instruct = ModelMeta(
138
140
  model_prompts=model_prompts,
139
141
  ),
140
142
  name="samaya-ai/promptriever-llama3.1-8b-instruct-v1",
143
+ model_type=["dense"],
141
144
  languages=["eng-Latn"],
142
145
  open_weights=True,
143
146
  revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision
144
147
  release_date="2024-09-15",
145
148
  n_parameters=8_000_000_000,
146
- memory_usage_mb=31,
149
+ memory_usage_mb=30518,
147
150
  max_tokens=8192,
148
151
  embed_dim=4096,
149
152
  training_datasets={
@@ -170,12 +173,13 @@ promptriever_mistral_v1 = ModelMeta(
170
173
  model_prompts=model_prompts,
171
174
  ),
172
175
  name="samaya-ai/promptriever-mistral-v0.1-7b-v1",
176
+ model_type=["dense"],
173
177
  languages=["eng-Latn"],
174
178
  open_weights=True,
175
179
  revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision
176
180
  release_date="2024-09-15",
177
181
  n_parameters=7_000_000_000,
178
- memory_usage_mb=27,
182
+ memory_usage_mb=26703,
179
183
  training_datasets={
180
184
  # "samaya-ai/msmarco-w-instructions",
181
185
  "mMARCO-NL", # translation not trained on
@@ -328,18 +328,16 @@ class MultiVectorModel(AbsEncoder, PylateSearchEncoder):
328
328
  inputs,
329
329
  prompt_name=prompt_name,
330
330
  is_query=prompt_type == PromptType.query,
331
- convert_to_tensor=True,
332
331
  **kwargs,
333
332
  )
334
333
 
335
- # encode returns a list of tensors shaped (x, token_dim), pad to uniform length
336
- pred = torch.nn.utils.rnn.pad_sequence(pred, batch_first=True, padding_value=0)
337
- return pred.cpu().numpy()
334
+ return pred
338
335
 
339
336
 
340
337
  colbert_v2 = ModelMeta(
341
338
  loader=MultiVectorModel,
342
339
  name="colbert-ir/colbertv2.0",
340
+ model_type=["late-interaction"],
343
341
  languages=["eng-Latn"],
344
342
  open_weights=True,
345
343
  revision="c1e84128e85ef755c096a95bdb06b47793b13acf",
@@ -372,6 +370,7 @@ jina_colbert_v2 = ModelMeta(
372
370
  trust_remote_code=True,
373
371
  ),
374
372
  name="jinaai/jina-colbert-v2",
373
+ model_type=["late-interaction"],
375
374
  languages=[
376
375
  "ara-Arab",
377
376
  "ben-Beng",
@@ -418,12 +417,37 @@ jina_colbert_v2 = ModelMeta(
418
417
  "DuRetrieval",
419
418
  "MIRACL",
420
419
  },
420
+ citation="""@inproceedings{xiao-etal-2024-jina,
421
+ title = "{J}ina-{C}ol{BERT}-v2: A General-Purpose Multilingual Late Interaction Retriever",
422
+ author = {Jha, Rohan and
423
+ Wang, Bo and
424
+ G{\"u}nther, Michael and
425
+ Mastrapas, Georgios and
426
+ Sturua, Saba and
427
+ Mohr, Isabelle and
428
+ Koukounas, Andreas and
429
+ Wang, Mohammad Kalim and
430
+ Wang, Nan and
431
+ Xiao, Han},
432
+ editor = {S{\"a}lev{\"a}, Jonne and
433
+ Owodunni, Abraham},
434
+ booktitle = "Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)",
435
+ month = nov,
436
+ year = "2024",
437
+ address = "Miami, Florida, USA",
438
+ publisher = "Association for Computational Linguistics",
439
+ url = "https://aclanthology.org/2024.mrl-1.11/",
440
+ doi = "10.18653/v1/2024.mrl-1.11",
441
+ pages = "159--166",
442
+ abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
443
+ }""",
421
444
  )
422
445
 
423
446
 
424
447
  lightonai__gte_moderncolbert_v1 = ModelMeta(
425
448
  loader=MultiVectorModel,
426
449
  name="lightonai/GTE-ModernColBERT-v1",
450
+ model_type=["late-interaction"],
427
451
  languages=[
428
452
  "eng-Latn",
429
453
  ],
@@ -447,4 +471,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
447
471
  "MSMARCO",
448
472
  "mMARCO-NL",
449
473
  },
474
+ citation="""@inproceedings{reimers-2019-sentence-bert,
475
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
476
+ author = "Reimers, Nils and Gurevych, Iryna",
477
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
478
+ month = "11",
479
+ year = "2019",
480
+ publisher = "Association for Computational Linguistics",
481
+ url = "https://arxiv.org/abs/1908.10084"
482
+ }""",
450
483
  )
@@ -30,6 +30,7 @@ qodo_languages = [
30
30
  Qodo_Embed_1_1_5B = ModelMeta(
31
31
  loader=sentence_transformers_loader,
32
32
  name="Qodo/Qodo-Embed-1-1.5B",
33
+ model_type=["dense"],
33
34
  languages=qodo_languages,
34
35
  open_weights=True,
35
36
  revision="84bbef079b32e8823ec226d4e9e92902706b0eb6",
@@ -52,6 +53,7 @@ Qodo_Embed_1_1_5B = ModelMeta(
52
53
  Qodo_Embed_1_7B = ModelMeta(
53
54
  loader=sentence_transformers_loader,
54
55
  name="Qodo/Qodo-Embed-1-7B",
56
+ model_type=["dense"],
55
57
  languages=qodo_languages,
56
58
  open_weights=True,
57
59
  revision="f9edd9bf7f687c0e832424058e265120f603cd81",
@@ -25,6 +25,7 @@ mini_gte_datasets = {
25
25
  mini_gte = ModelMeta(
26
26
  loader=sentence_transformers_loader,
27
27
  name="prdev/mini-gte",
28
+ model_type=["dense"],
28
29
  languages=["eng-Latn"],
29
30
  open_weights=True,
30
31
  revision="7fbe6f9b4cc42615e0747299f837ad7769025492",
@@ -134,12 +134,13 @@ def q3e_instruct_loader(
134
134
  Qwen3_Embedding_0B6 = ModelMeta(
135
135
  loader=q3e_instruct_loader,
136
136
  name="Qwen/Qwen3-Embedding-0.6B",
137
+ model_type=["dense"],
137
138
  languages=multilingual_langs,
138
139
  open_weights=True,
139
140
  revision="b22da495047858cce924d27d76261e96be6febc0", # Commit of @tomaarsen
140
141
  release_date="2025-06-05",
141
142
  n_parameters=595776512,
142
- memory_usage_mb=2272,
143
+ memory_usage_mb=1136,
143
144
  embed_dim=1024,
144
145
  max_tokens=32768,
145
146
  license="apache-2.0",
@@ -156,12 +157,13 @@ Qwen3_Embedding_0B6 = ModelMeta(
156
157
  Qwen3_Embedding_4B = ModelMeta(
157
158
  loader=q3e_instruct_loader,
158
159
  name="Qwen/Qwen3-Embedding-4B",
160
+ model_type=["dense"],
159
161
  languages=multilingual_langs,
160
162
  open_weights=True,
161
163
  revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff", # Commit of @tomaarsen
162
164
  release_date="2025-06-05",
163
165
  n_parameters=4021774336,
164
- memory_usage_mb=15341,
166
+ memory_usage_mb=7671,
165
167
  embed_dim=2560,
166
168
  max_tokens=32768,
167
169
  license="apache-2.0",
@@ -178,12 +180,13 @@ Qwen3_Embedding_4B = ModelMeta(
178
180
  Qwen3_Embedding_8B = ModelMeta(
179
181
  loader=q3e_instruct_loader,
180
182
  name="Qwen/Qwen3-Embedding-8B",
183
+ model_type=["dense"],
181
184
  languages=multilingual_langs,
182
185
  open_weights=True,
183
186
  revision="4e423935c619ae4df87b646a3ce949610c66241c", # Commit of @tomaarsen
184
187
  release_date="2025-06-05",
185
188
  n_parameters=7567295488,
186
- memory_usage_mb=28866,
189
+ memory_usage_mb=14433,
187
190
  embed_dim=4096,
188
191
  max_tokens=32768,
189
192
  license="apache-2.0",
@@ -58,12 +58,13 @@ QZhou_Embedding = ModelMeta(
58
58
  apply_instruction_to_passages=False,
59
59
  ),
60
60
  name="Kingsoft-LLM/QZhou-Embedding",
61
+ model_type=["dense"],
61
62
  languages=["eng-Latn", "zho-Hans"],
62
63
  open_weights=True,
63
64
  revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8",
64
65
  release_date="2025-08-24",
65
66
  n_parameters=7_070_619_136,
66
- memory_usage_mb=29070,
67
+ memory_usage_mb=14436,
67
68
  embed_dim=3584,
68
69
  license="apache-2.0",
69
70
  max_tokens=8192,
@@ -91,6 +92,7 @@ QZhou_Embedding_Zh = ModelMeta(
91
92
  apply_instruction_to_passages=False,
92
93
  ),
93
94
  name="Kingsoft-LLM/QZhou-Embedding-Zh",
95
+ model_type=["dense"],
94
96
  languages=["zho-Hans"],
95
97
  open_weights=True,
96
98
  revision="0321ccb126413d1e49c5ce908e802b63d35f18e2",
@@ -1,15 +1,23 @@
1
+ from __future__ import annotations
2
+
1
3
  import hashlib
2
- from typing import Any, Literal
4
+ from typing import TYPE_CHECKING, Any, Literal
3
5
 
4
6
  import numpy as np
5
7
  import torch
6
- from PIL import Image
7
8
  from torch.utils.data import DataLoader
8
9
 
9
10
  from mteb.abstasks.task_metadata import TaskMetadata
10
11
  from mteb.models.model_meta import ModelMeta
12
+ from mteb.similarity_functions import (
13
+ select_pairwise_similarity,
14
+ select_similarity,
15
+ )
11
16
  from mteb.types._encoder_io import Array, BatchedInput, PromptType
12
17
 
18
+ if TYPE_CHECKING:
19
+ from PIL import Image
20
+
13
21
 
14
22
  def _string_to_vector(text: str | None, size: int) -> np.ndarray:
15
23
  """Generate a deterministic random vector based on a string.
@@ -155,15 +163,9 @@ class RandomEncoderBaseline:
155
163
  Returns:
156
164
  Cosine similarity matrix between the two sets of embeddings
157
165
  """
158
- norm1 = np.linalg.norm(
159
- embeddings1.reshape(-1, self.embedding_dim), axis=1, keepdims=True
166
+ return select_similarity(
167
+ embeddings1, embeddings2, self.mteb_model_meta.similarity_fn_name
160
168
  )
161
- norm2 = np.linalg.norm(
162
- embeddings2.reshape(-1, self.embedding_dim), axis=1, keepdims=True
163
- )
164
- normalized1 = embeddings1 / (norm1 + 1e-10)
165
- normalized2 = embeddings2 / (norm2 + 1e-10)
166
- return np.dot(normalized1, normalized2.T)
167
169
 
168
170
  def similarity_pairwise(
169
171
  self,
@@ -179,22 +181,15 @@ class RandomEncoderBaseline:
179
181
  Returns:
180
182
  Cosine similarity for each pair of embeddings
181
183
  """
182
- norm1 = np.linalg.norm(
183
- embeddings1.reshape(-1, self.embedding_dim), axis=1, keepdims=True
184
- )
185
- norm2 = np.linalg.norm(
186
- embeddings2.reshape(-1, self.embedding_dim), axis=1, keepdims=True
184
+ return select_pairwise_similarity(
185
+ embeddings1, embeddings2, self.mteb_model_meta.similarity_fn_name
187
186
  )
188
- normalized1 = embeddings1 / (norm1 + 1e-10)
189
- normalized2 = embeddings2 / (norm2 + 1e-10)
190
- normalized1 = np.asarray(normalized1)
191
- normalized2 = np.asarray(normalized2)
192
- return np.sum(normalized1 * normalized2, axis=1)
193
187
 
194
188
 
195
189
  random_encoder_baseline = ModelMeta(
196
190
  loader=RandomEncoderBaseline, # type: ignore
197
191
  name="baseline/random-encoder-baseline",
192
+ model_type=["dense"],
198
193
  modalities=["text", "image"],
199
194
  **_common_mock_metadata,
200
195
  )
@@ -239,7 +234,7 @@ class RandomCrossEncoderBaseline:
239
234
  random_cross_encoder_baseline = ModelMeta(
240
235
  loader=RandomCrossEncoderBaseline, # type: ignore
241
236
  name="baseline/random-cross-encoder-baseline",
237
+ model_type=["cross-encoder"],
242
238
  modalities=["text", "image"],
243
- is_cross_encoder=True,
244
239
  **_common_mock_metadata,
245
240
  )
@@ -0,0 +1,34 @@
1
+ import numpy as np
2
+
3
+ from mteb.models.model_implementations.model2vec_models import Model2VecModel
4
+ from mteb.models.model_meta import ModelMeta, ScoringFunction
5
+
6
+ potion_base_8m = ModelMeta(
7
+ loader=Model2VecModel, # type: ignore
8
+ name="rasgaard/m2v-dfm-large",
9
+ model_type=["dense"],
10
+ languages=["dan-Latn"],
11
+ open_weights=True,
12
+ revision="387897cfb09992e6d45ea9cd7b28b9fcf119e23a",
13
+ release_date="2025-10-08",
14
+ n_parameters=22893312,
15
+ memory_usage_mb=87,
16
+ max_tokens=np.inf,
17
+ embed_dim=256,
18
+ license="mit",
19
+ similarity_fn_name=ScoringFunction.COSINE,
20
+ framework=["NumPy", "Sentence Transformers"],
21
+ reference="https://huggingface.co/rasgaard/m2v-dfm-large",
22
+ use_instructions=False,
23
+ adapted_from="KennethEnevoldsen/dfm-sentence-encoder-large",
24
+ superseded_by=None,
25
+ training_datasets=set(), # distilled
26
+ public_training_code="https://github.com/MinishLab/model2vec",
27
+ public_training_data="https://huggingface.co/datasets/HuggingFaceFW/fineweb-2", # distilled on this
28
+ citation="""@article{minishlab2024model2vec,
29
+ author = {Tulkens, Stephan and {van Dongen}, Thomas},
30
+ title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
31
+ year = {2024},
32
+ url = {https://github.com/MinishLab/model2vec}
33
+ }""",
34
+ )
@@ -44,6 +44,7 @@ ReasonIR_8B = ModelMeta(
44
44
  trust_remote_code=True,
45
45
  ),
46
46
  name="ReasonIR/ReasonIR-8B",
47
+ model_type=["dense"],
47
48
  languages=["eng-Latn"],
48
49
  open_weights=True,
49
50
  revision="c3d0690370ff4a8c3d3882d8dfa85c43650034fa",
@@ -162,6 +162,7 @@ repllama_llama2_original = ModelMeta(
162
162
  model_prompts=model_prompts,
163
163
  ),
164
164
  name="castorini/repllama-v1-7b-lora-passage",
165
+ model_type=["dense"],
165
166
  languages=["eng-Latn"],
166
167
  open_weights=True,
167
168
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-6097554dfe6e7d93e92f55010b678bcca1e233a8", # base-peft revision
@@ -194,6 +195,7 @@ repllama_llama2_reproduced = ModelMeta(
194
195
  model_prompts=model_prompts,
195
196
  ),
196
197
  name="samaya-ai/RepLLaMA-reproduced",
198
+ model_type=["dense"],
197
199
  languages=["eng-Latn"],
198
200
  open_weights=True,
199
201
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision