mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458) hide show
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import math
3
- from typing import Any
5
+ from typing import TYPE_CHECKING, Any
4
6
 
5
7
  import torch
6
- from PIL import Image
7
8
  from torch.utils.data import DataLoader
8
9
  from tqdm.autonotebook import tqdm
9
10
 
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
12
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
13
14
  from mteb.types import Array, BatchedInput, PromptType
14
15
 
16
+ if TYPE_CHECKING:
17
+ from PIL import Image
18
+
15
19
  logger = logging.getLogger(__name__)
16
20
 
17
21
  GME_CITATION = """@misc{zhang2024gme,
@@ -267,9 +271,9 @@ def smart_resize(
267
271
  return h_bar, w_bar
268
272
 
269
273
 
270
- def fetch_image(
271
- image: str | Image.Image, size_factor: int = IMAGE_FACTOR
272
- ) -> Image.Image:
274
+ def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
275
+ from PIL import Image
276
+
273
277
  image_obj = None
274
278
  if isinstance(image, Image.Image):
275
279
  image_obj = image
@@ -342,6 +346,7 @@ training_data = {
342
346
  gme_qwen2vl_2b = ModelMeta(
343
347
  loader=GmeQwen2VL,
344
348
  name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
349
+ model_type=["dense"],
345
350
  languages=["eng-Latn", "cmn-Hans"],
346
351
  open_weights=True,
347
352
  revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
@@ -365,6 +370,7 @@ gme_qwen2vl_2b = ModelMeta(
365
370
  gme_qwen2vl_7b = ModelMeta(
366
371
  loader=GmeQwen2VL,
367
372
  name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
373
+ model_type=["dense"],
368
374
  languages=["eng-Latn", "cmn-Hans"],
369
375
  open_weights=True,
370
376
  revision="477027a6480f8630363be77751f169cc3434b673",
@@ -147,10 +147,10 @@ class GoogleTextEmbeddingModel(AbsEncoder):
147
147
  google_text_emb_004 = ModelMeta(
148
148
  loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
149
149
  loader_kwargs=dict(
150
- model_name="text-embedding-004",
151
150
  model_prompts=MODEL_PROMPTS,
152
151
  ),
153
152
  name="google/text-embedding-004",
153
+ model_type=["dense"],
154
154
  languages=["eng-Latn"],
155
155
  open_weights=False,
156
156
  revision="1", # revision is intended for implementation
@@ -172,10 +172,10 @@ google_text_emb_004 = ModelMeta(
172
172
  google_text_emb_005 = ModelMeta(
173
173
  loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
174
174
  loader_kwargs=dict(
175
- model_name="text-embedding-005",
176
175
  model_prompts=MODEL_PROMPTS,
177
176
  ),
178
177
  name="google/text-embedding-005",
178
+ model_type=["dense"],
179
179
  languages=["eng-Latn"],
180
180
  open_weights=False,
181
181
  revision="1", # revision is intended for implementation
@@ -197,10 +197,10 @@ google_text_emb_005 = ModelMeta(
197
197
  google_text_multilingual_emb_002 = ModelMeta(
198
198
  loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
199
199
  loader_kwargs=dict(
200
- model_name="text-embedding-002",
201
200
  model_prompts=MODEL_PROMPTS,
202
201
  ),
203
202
  name="google/text-multilingual-embedding-002",
203
+ model_type=["dense"],
204
204
  languages=MULTILINGUAL_EVALUATED_LANGUAGES, # From the list of evaluated languages in https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#supported_text_languages
205
205
  open_weights=False,
206
206
  revision="1",
@@ -222,10 +222,10 @@ google_text_multilingual_emb_002 = ModelMeta(
222
222
  google_gemini_embedding_001 = ModelMeta(
223
223
  loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
224
224
  loader_kwargs=dict(
225
- model_name="gemini-embedding-001",
226
225
  model_prompts=MODEL_PROMPTS,
227
226
  ),
228
227
  name="google/gemini-embedding-001",
228
+ model_type=["dense"],
229
229
  languages=MULTILINGUAL_EVALUATED_LANGUAGES,
230
230
  open_weights=False,
231
231
  revision="1",
@@ -260,6 +260,7 @@ def gemma_embedding_loader(model_name: str, revision: str, **kwargs):
260
260
  embedding_gemma_300m = ModelMeta(
261
261
  loader=gemma_embedding_loader,
262
262
  name="google/embeddinggemma-300m",
263
+ model_type=["dense"],
263
264
  languages=MULTILINGUAL_EVALUATED_LANGUAGES,
264
265
  open_weights=True,
265
266
  revision="64614b0b8b64f0c6c1e52b07e4e9a4e8fe4d2da2",
@@ -275,5 +276,15 @@ embedding_gemma_300m = ModelMeta(
275
276
  public_training_data=None,
276
277
  training_datasets=GECKO_TRAINING_DATA,
277
278
  similarity_fn_name="cosine",
278
- memory_usage_mb=578,
279
+ memory_usage_mb=1155,
280
+ citation="""
281
+ @misc{vera2025embeddinggemmapowerfullightweighttext,
282
+ title={EmbeddingGemma: Powerful and Lightweight Text Representations},
283
+ author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
284
+ year={2025},
285
+ eprint={2509.20354},
286
+ archivePrefix={arXiv},
287
+ primaryClass={cs.CL},
288
+ url={https://arxiv.org/abs/2509.20354},
289
+ }""",
279
290
  )
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from PIL import Image
6
7
  from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
@@ -15,6 +16,9 @@ from mteb.types import Array, BatchedInput, PromptType
15
16
 
16
17
  logger = logging.getLogger(__name__)
17
18
 
19
+ if TYPE_CHECKING:
20
+ from PIL import Image
21
+
18
22
 
19
23
  class GraniteVisionEmbeddingWrapper:
20
24
  def __init__(
@@ -162,6 +166,7 @@ granite_vision_embedding = ModelMeta(
162
166
  torch_dtype=torch.float16,
163
167
  ),
164
168
  name="ibm-granite/granite-vision-3.3-2b-embedding",
169
+ model_type=["dense"],
165
170
  languages=["eng-Latn"],
166
171
  revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca",
167
172
  release_date="2025-06-11",
@@ -38,6 +38,7 @@ gritlm7b = ModelMeta(
38
38
  torch_dtype="auto",
39
39
  ),
40
40
  name="GritLM/GritLM-7B",
41
+ model_type=["dense"],
41
42
  languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
42
43
  open_weights=True,
43
44
  revision="13f00a0e36500c80ce12870ea513846a066004af",
@@ -66,6 +67,7 @@ gritlm8x7b = ModelMeta(
66
67
  torch_dtype="auto",
67
68
  ),
68
69
  name="GritLM/GritLM-8x7B",
70
+ model_type=["dense"],
69
71
  languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
70
72
  open_weights=True,
71
73
  revision="7f089b13e3345510281733ca1e6ff871b5b4bc76",
@@ -42,6 +42,7 @@ gte_qwen2_7b_instruct = ModelMeta(
42
42
  embed_eos="<|endoftext|>",
43
43
  ),
44
44
  name="Alibaba-NLP/gte-Qwen2-7B-instruct",
45
+ model_type=["dense"],
45
46
  languages=None,
46
47
  open_weights=True,
47
48
  revision="e26182b2122f4435e8b3ebecbf363990f409b45b",
@@ -73,6 +74,7 @@ gte_qwen1_5_7b_instruct = ModelMeta(
73
74
  embed_eos="<|endoftext|>",
74
75
  ),
75
76
  name="Alibaba-NLP/gte-Qwen1.5-7B-instruct",
77
+ model_type=["dense"],
76
78
  languages=["eng-Latn"],
77
79
  open_weights=True,
78
80
  revision="07d27e5226328010336563bc1b564a5e3436a298",
@@ -89,6 +91,12 @@ gte_qwen1_5_7b_instruct = ModelMeta(
89
91
  public_training_code=None,
90
92
  public_training_data=None,
91
93
  training_datasets=None,
94
+ citation="""@article{li2023towards,
95
+ title={Towards general text embeddings with multi-stage contrastive learning},
96
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
97
+ journal={arXiv preprint arXiv:2308.03281},
98
+ year={2023}
99
+ }""",
92
100
  )
93
101
 
94
102
  gte_qwen2_1_5b_instruct = ModelMeta(
@@ -103,6 +111,7 @@ gte_qwen2_1_5b_instruct = ModelMeta(
103
111
  embed_eos="<|endoftext|>",
104
112
  ),
105
113
  name="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
114
+ model_type=["dense"],
106
115
  languages=["eng-Latn"],
107
116
  open_weights=True,
108
117
  revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd",
@@ -119,11 +128,18 @@ gte_qwen2_1_5b_instruct = ModelMeta(
119
128
  public_training_code=None,
120
129
  public_training_data=None,
121
130
  training_datasets=None,
131
+ citation="""@article{li2023towards,
132
+ title={Towards general text embeddings with multi-stage contrastive learning},
133
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
134
+ journal={arXiv preprint arXiv:2308.03281},
135
+ year={2023}
136
+ }""",
122
137
  )
123
138
 
124
139
  gte_small_zh = ModelMeta(
125
140
  loader=sentence_transformers_loader,
126
141
  name="thenlper/gte-small-zh",
142
+ model_type=["dense"],
127
143
  languages=["zho-Hans"],
128
144
  open_weights=True,
129
145
  revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a",
@@ -140,11 +156,18 @@ gte_small_zh = ModelMeta(
140
156
  public_training_code=None,
141
157
  public_training_data=None,
142
158
  training_datasets=None, # Not disclosed
159
+ citation="""@article{li2023towards,
160
+ title={Towards general text embeddings with multi-stage contrastive learning},
161
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
162
+ journal={arXiv preprint arXiv:2308.03281},
163
+ year={2023}
164
+ }""",
143
165
  )
144
166
 
145
167
  gte_base_zh = ModelMeta(
146
168
  loader=sentence_transformers_loader,
147
169
  name="thenlper/gte-base-zh",
170
+ model_type=["dense"],
148
171
  languages=["zho-Hans"],
149
172
  open_weights=True,
150
173
  revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c",
@@ -161,11 +184,18 @@ gte_base_zh = ModelMeta(
161
184
  public_training_code=None,
162
185
  public_training_data=None,
163
186
  training_datasets=None, # Not disclosed
187
+ citation="""@article{li2023towards,
188
+ title={Towards general text embeddings with multi-stage contrastive learning},
189
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
190
+ journal={arXiv preprint arXiv:2308.03281},
191
+ year={2023}
192
+ }""",
164
193
  )
165
194
 
166
195
  gte_large_zh = ModelMeta(
167
196
  loader=sentence_transformers_loader,
168
197
  name="thenlper/gte-large-zh",
198
+ model_type=["dense"],
169
199
  languages=["zho-Hans"],
170
200
  open_weights=True,
171
201
  revision="64c364e579de308104a9b2c170ca009502f4f545",
@@ -182,6 +212,12 @@ gte_large_zh = ModelMeta(
182
212
  public_training_code=None,
183
213
  public_training_data=None,
184
214
  training_datasets=None, # Not disclosed
215
+ citation="""@article{li2023towards,
216
+ title={Towards general text embeddings with multi-stage contrastive learning},
217
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
218
+ journal={arXiv preprint arXiv:2308.03281},
219
+ year={2023}
220
+ }""",
185
221
  )
186
222
 
187
223
  gte_multilingual_langs = [
@@ -288,6 +324,7 @@ gte_multi_training_data = {
288
324
  gte_multilingual_base = ModelMeta(
289
325
  loader=sentence_transformers_loader,
290
326
  name="Alibaba-NLP/gte-multilingual-base",
327
+ model_type=["dense"],
291
328
  languages=gte_multilingual_langs,
292
329
  open_weights=True,
293
330
  revision="ca1791e0bcc104f6db161f27de1340241b13c5a4",
@@ -304,11 +341,19 @@ gte_multilingual_base = ModelMeta(
304
341
  public_training_code=None,
305
342
  public_training_data=None, # couldn't find
306
343
  training_datasets=gte_multi_training_data,
344
+ citation="""@inproceedings{zhang2024mgte,
345
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
346
+ author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
347
+ booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
348
+ pages={1393--1412},
349
+ year={2024}
350
+ }""",
307
351
  )
308
352
 
309
353
  gte_modernbert_base = ModelMeta(
310
354
  loader=sentence_transformers_loader,
311
355
  name="Alibaba-NLP/gte-modernbert-base",
356
+ model_type=["dense"],
312
357
  languages=["eng-Latn"],
313
358
  open_weights=True,
314
359
  revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4",
@@ -325,12 +370,27 @@ gte_modernbert_base = ModelMeta(
325
370
  public_training_code=None, # couldn't find
326
371
  public_training_data=None,
327
372
  training_datasets=gte_multi_training_data, # English part of gte_multi_training_data,
373
+ citation="""@inproceedings{zhang2024mgte,
374
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
375
+ author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
376
+ booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
377
+ pages={1393--1412},
378
+ year={2024}
379
+ }
380
+
381
+ @article{li2023towards,
382
+ title={Towards general text embeddings with multi-stage contrastive learning},
383
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
384
+ journal={arXiv preprint arXiv:2308.03281},
385
+ year={2023}
386
+ }""",
328
387
  )
329
388
 
330
389
 
331
390
  gte_base_en_v15 = ModelMeta(
332
391
  loader=sentence_transformers_loader,
333
392
  name="Alibaba-NLP/gte-base-en-v1.5",
393
+ model_type=["dense"],
334
394
  languages=["eng-Latn"],
335
395
  open_weights=True,
336
396
  revision="a829fd0e060bb84554da0dfd354d0de0f7712b7f", # can be any
@@ -349,4 +409,22 @@ gte_base_en_v15 = ModelMeta(
349
409
  public_training_code=None,
350
410
  public_training_data=None,
351
411
  training_datasets=None,
412
+ citation="""@misc{zhang2024mgte,
413
+ title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
414
+ author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
415
+ year={2024},
416
+ eprint={2407.19669},
417
+ archivePrefix={arXiv},
418
+ primaryClass={cs.CL},
419
+ url={https://arxiv.org/abs/2407.19669},
420
+ }
421
+ @misc{li2023gte,
422
+ title={Towards General Text Embeddings with Multi-stage Contrastive Learning},
423
+ author={Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
424
+ year={2023},
425
+ eprint={2308.03281},
426
+ archivePrefix={arXiv},
427
+ primaryClass={cs.CL},
428
+ url={https://arxiv.org/abs/2308.03281},
429
+ }""",
352
430
  )
@@ -37,6 +37,7 @@ Hinvec_bidir = ModelMeta(
37
37
  add_eos_token=True,
38
38
  ),
39
39
  name="Sailesh97/Hinvec",
40
+ model_type=["dense"],
40
41
  languages=["eng-Latn", "hin-Deva"],
41
42
  open_weights=True,
42
43
  revision="d4fc678720cc1b8c5d18599ce2d9a4d6090c8b6b",
@@ -3,6 +3,7 @@ from mteb.models import ModelMeta
3
3
  human = ModelMeta(
4
4
  loader=None,
5
5
  name="Human",
6
+ model_type=["dense"],
6
7
  languages=["eng-Latn", "ara-Arab", "rus-Cyrl", "dan-Latn", "nob-Latn"],
7
8
  open_weights=True,
8
9
  revision="2025_09_25",
@@ -94,6 +94,7 @@ granite_training_data = {
94
94
  granite_107m_multilingual = ModelMeta(
95
95
  loader=sentence_transformers_loader,
96
96
  name="ibm-granite/granite-embedding-107m-multilingual",
97
+ model_type=["dense"],
97
98
  languages=GRANITE_LANGUAGES,
98
99
  open_weights=True,
99
100
  revision="47db56afe692f731540413c67dd818ff492277e7",
@@ -118,6 +119,7 @@ granite_107m_multilingual = ModelMeta(
118
119
  granite_278m_multilingual = ModelMeta(
119
120
  loader=sentence_transformers_loader,
120
121
  name="ibm-granite/granite-embedding-278m-multilingual",
122
+ model_type=["dense"],
121
123
  languages=GRANITE_LANGUAGES,
122
124
  open_weights=True,
123
125
  revision="84e3546b88b0cb69f8078608a1df558020bcbf1f",
@@ -142,6 +144,7 @@ granite_278m_multilingual = ModelMeta(
142
144
  granite_30m_english = ModelMeta(
143
145
  loader=sentence_transformers_loader,
144
146
  name="ibm-granite/granite-embedding-30m-english",
147
+ model_type=["dense"],
145
148
  languages=["eng-Latn"],
146
149
  open_weights=True,
147
150
  revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5",
@@ -166,6 +169,7 @@ granite_30m_english = ModelMeta(
166
169
  granite_125m_english = ModelMeta(
167
170
  loader=sentence_transformers_loader,
168
171
  name="ibm-granite/granite-embedding-125m-english",
172
+ model_type=["dense"],
169
173
  languages=["eng-Latn"],
170
174
  open_weights=True,
171
175
  revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730",
@@ -191,6 +195,7 @@ granite_125m_english = ModelMeta(
191
195
  granite_english_r2 = ModelMeta(
192
196
  loader=sentence_transformers_loader,
193
197
  name="ibm-granite/granite-embedding-english-r2",
198
+ model_type=["dense"],
194
199
  languages=["eng-Latn"],
195
200
  open_weights=True,
196
201
  revision="6e7b8ce0e76270394ac4669ba4bbd7133b60b7f9",
@@ -215,6 +220,7 @@ granite_english_r2 = ModelMeta(
215
220
  granite_small_english_r2 = ModelMeta(
216
221
  loader=sentence_transformers_loader,
217
222
  name="ibm-granite/granite-embedding-small-english-r2",
223
+ model_type=["dense"],
218
224
  languages=["eng-Latn"],
219
225
  open_weights=True,
220
226
  revision="54a8d2616a0844355a5164432d3f6dafb37b17a3",
@@ -50,6 +50,7 @@ inf_retriever_v1 = ModelMeta(
50
50
  trust_remote_code=True,
51
51
  ),
52
52
  name="infly/inf-retriever-v1",
53
+ model_type=["dense"],
53
54
  languages=["eng-Latn", "zho-Hans"],
54
55
  open_weights=True,
55
56
  revision="cb70ca7c31dfa866b2eff2dad229c144d8ddfd91",
@@ -76,6 +77,7 @@ inf_retriever_v1_1_5b = ModelMeta(
76
77
  trust_remote_code=True,
77
78
  ),
78
79
  name="infly/inf-retriever-v1-1.5b",
80
+ model_type=["dense"],
79
81
  languages=["eng-Latn", "zho-Hans"],
80
82
  open_weights=True,
81
83
  revision="c9c05c2dd50707a486966ba81703021ae2094a06",