mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff shows the differences between the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (458):
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ mme5_mllama = ModelMeta(
12
12
  "trust_remote_code": True,
13
13
  },
14
14
  name="intfloat/mmE5-mllama-11b-instruct",
15
+ model_type=["dense"],
15
16
  revision="cbb328b9bf9ff5362c852c3166931903226d46f1",
16
17
  release_date="2025-02-12",
17
18
  languages=["eng-Latn"],
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ MOCOV3_CITATION = """@Article{chen2021mocov3,
14
+ author = {Xinlei Chen* and Saining Xie* and Kaiming He},
15
+ title = {An Empirical Study of Training Self-Supervised Vision Transformers},
16
+ journal = {arXiv preprint arXiv:2104.02057},
17
+ year = {2021},
18
+ }"""
19
+
13
20
 
14
21
  def mocov3_loader(model_name, **kwargs):
15
22
  requires_package(mocov3_loader, "timm", model_name, "pip install 'mteb[timm]'")
@@ -112,6 +119,7 @@ mocov3_training_datasets = set(
112
119
  mocov3_vit_base = ModelMeta(
113
120
  loader=mocov3_loader, # type: ignore
114
121
  name="nyu-visionx/moco-v3-vit-b",
122
+ model_type=["dense"],
115
123
  languages=["eng-Latn"],
116
124
  revision="7d091cd70772c5c0ecf7f00b5f12ca609a99d69d",
117
125
  release_date="2024-06-03",
@@ -129,11 +137,13 @@ mocov3_vit_base = ModelMeta(
129
137
  similarity_fn_name=ScoringFunction.COSINE,
130
138
  use_instructions=False,
131
139
  training_datasets=mocov3_training_datasets,
140
+ citation=MOCOV3_CITATION,
132
141
  )
133
142
 
134
143
  mocov3_vit_large = ModelMeta(
135
144
  loader=mocov3_loader, # type: ignore
136
145
  name="nyu-visionx/moco-v3-vit-l",
146
+ model_type=["dense"],
137
147
  languages=["eng-Latn"],
138
148
  revision="7bf75358d616f39b9716148bf4e3425f3bd35b47",
139
149
  release_date="2024-06-03",
@@ -151,4 +161,5 @@ mocov3_vit_large = ModelMeta(
151
161
  similarity_fn_name=ScoringFunction.COSINE,
152
162
  use_instructions=False,
153
163
  training_datasets=mocov3_training_datasets,
164
+ citation=MOCOV3_CITATION,
154
165
  )
@@ -0,0 +1,191 @@
1
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
+ from mteb.models.model_meta import ModelMeta
3
+ from mteb.models.models_protocols import PromptType
4
+
5
+
6
+ def instruction_template(
7
+ instruction: str, prompt_type: PromptType | None = None
8
+ ) -> str:
9
+ if not instruction or prompt_type == PromptType.document:
10
+ return ""
11
+ if isinstance(instruction, dict):
12
+ if prompt_type is None:
13
+ instruction = next(iter(instruction.values())) # TODO
14
+ else:
15
+ instruction = instruction[prompt_type]
16
+ return f"Instruct: {instruction}\nQuery:"
17
+
18
+
19
+ multilingual_langs = [
20
+ "afr-Latn",
21
+ "ara-Arab",
22
+ "aze-Latn",
23
+ "bel-Cyrl",
24
+ "bul-Cyrl",
25
+ "ben-Beng",
26
+ "cat-Latn",
27
+ "ceb-Latn",
28
+ "ces-Latn",
29
+ "cym-Latn",
30
+ "dan-Latn",
31
+ "deu-Latn",
32
+ "ell-Grek",
33
+ "eng-Latn",
34
+ "spa-Latn",
35
+ "est-Latn",
36
+ "eus-Latn",
37
+ "fas-Arab",
38
+ "fin-Latn",
39
+ "fra-Latn",
40
+ "glg-Latn",
41
+ "guj-Gujr",
42
+ "heb-Hebr",
43
+ "hin-Deva",
44
+ "hrv-Latn",
45
+ "hat-Latn",
46
+ "hun-Latn",
47
+ "hye-Armn",
48
+ "ind-Latn",
49
+ "isl-Latn",
50
+ "ita-Latn",
51
+ "jpn-Jpan",
52
+ "jav-Latn",
53
+ "kat-Geor",
54
+ "kaz-Cyrl",
55
+ "khm-Khmr",
56
+ "kan-Knda",
57
+ "kor-Hang",
58
+ "kir-Cyrl",
59
+ "lao-Laoo",
60
+ "lit-Latn",
61
+ "lav-Latn",
62
+ "mkd-Cyrl",
63
+ "mal-Mlym",
64
+ "mon-Cyrl",
65
+ "mar-Deva",
66
+ "msa-Latn",
67
+ "mya-Mymr",
68
+ "nep-Deva",
69
+ "nld-Latn",
70
+ "nor-Latn",
71
+ "nob-Latn",
72
+ "nno-Latn",
73
+ "pan-Guru",
74
+ "pol-Latn",
75
+ "por-Latn",
76
+ "que-Latn",
77
+ "ron-Latn",
78
+ "rus-Cyrl",
79
+ "sin-Sinh",
80
+ "slk-Latn",
81
+ "slv-Latn",
82
+ "swa-Latn",
83
+ "tam-Taml",
84
+ "tel-Telu",
85
+ "tha-Thai",
86
+ "tgl-Latn",
87
+ "tur-Latn",
88
+ "ukr-Cyrl",
89
+ "urd-Arab",
90
+ "vie-Latn",
91
+ "yor-Latn",
92
+ "zho-Hans",
93
+ ]
94
+
95
+ MOD_CITATION = """@misc{mod-embedding-2025,
96
+ title={MoD-Embedding: A Fine-tuned Multilingual Text Embedding Model},
97
+ author={MoD Team},
98
+ year={2025},
99
+ url={https://huggingface.co/bflhc/MoD-Embedding}
100
+ }"""
101
+
102
+ training_data = {
103
+ "T2Retrieval",
104
+ "DuRetrieval",
105
+ "MMarcoReranking",
106
+ "CMedQAv2-reranking",
107
+ "NQ",
108
+ "MSMARCO",
109
+ "HotpotQA",
110
+ "FEVER",
111
+ "MrTidyRetrieval",
112
+ "MIRACLRetrieval",
113
+ "CodeSearchNet",
114
+ }
115
+
116
+ # Predefined prompts for various RTEB tasks
117
+ _PREDEFINED_PROMPTS = {
118
+ # ========== Open Datasets ==========
119
+ # Legal domain
120
+ "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
121
+ "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
122
+ "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
123
+ "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
124
+ # Code domain
125
+ "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
126
+ "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
127
+ "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
128
+ "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
129
+ "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
130
+ # Finance domain
131
+ "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
132
+ "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
133
+ "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
134
+ # Medical domain
135
+ "CUREv1": "Given a medical query, retrieve relevant clinical documents",
136
+ "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
137
+ # SQL domain
138
+ "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
139
+ # Multilingual
140
+ "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
141
+ # ========== Private/Closed Datasets ==========
142
+ # Code domain (Private)
143
+ "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
144
+ "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
145
+ # Finance domain (Private)
146
+ "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
147
+ "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
148
+ "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
149
+ "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
150
+ # Healthcare domain (Private)
151
+ "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
152
+ "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
153
+ # Legal domain (Private)
154
+ "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
155
+ "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
156
+ "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
157
+ # General/Multilingual (Private)
158
+ "French1Retrieval": "Given a query, retrieve relevant passages",
159
+ "German1Retrieval": "Given a query, retrieve relevant passages",
160
+ }
161
+
162
+
163
+ MoD_Embedding = ModelMeta(
164
+ loader=InstructSentenceTransformerModel,
165
+ loader_kwargs=dict(
166
+ instruction_template=instruction_template,
167
+ apply_instruction_to_passages=False,
168
+ prompts_dict=_PREDEFINED_PROMPTS,
169
+ max_seq_length=18480,
170
+ model_kwargs={"torch_dtype": "bfloat16"},
171
+ ),
172
+ name="bflhc/MoD-Embedding",
173
+ languages=multilingual_langs,
174
+ open_weights=True,
175
+ revision="acbb5b70fdab262226a6af2bc62001de8021b05c",
176
+ release_date="2025-12-14",
177
+ n_parameters=4021774336,
178
+ memory_usage_mb=7671,
179
+ embed_dim=2560,
180
+ max_tokens=32768,
181
+ license="apache-2.0",
182
+ reference="https://huggingface.co/bflhc/MoD-Embedding",
183
+ similarity_fn_name="cosine",
184
+ framework=["Sentence Transformers", "PyTorch"],
185
+ use_instructions=True,
186
+ public_training_code=None,
187
+ public_training_data=None,
188
+ training_datasets=training_data,
189
+ citation=MOD_CITATION,
190
+ adapted_from="Qwen/Qwen3-Embedding-4B",
191
+ )
@@ -161,6 +161,7 @@ class Model2VecModel(AbsEncoder):
161
161
  m2v_base_glove_subword = ModelMeta(
162
162
  loader=Model2VecModel,
163
163
  name="minishlab/M2V_base_glove_subword",
164
+ model_type=["dense"],
164
165
  languages=["eng-Latn"],
165
166
  open_weights=True,
166
167
  revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4",
@@ -186,6 +187,7 @@ m2v_base_glove_subword = ModelMeta(
186
187
  m2v_base_glove = ModelMeta(
187
188
  loader=Model2VecModel,
188
189
  name="minishlab/M2V_base_glove",
190
+ model_type=["dense"],
189
191
  languages=["eng-Latn"],
190
192
  open_weights=True,
191
193
  revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2b",
@@ -210,6 +212,7 @@ m2v_base_glove = ModelMeta(
210
212
  m2v_base_output = ModelMeta(
211
213
  loader=Model2VecModel,
212
214
  name="minishlab/M2V_base_output",
215
+ model_type=["dense"],
213
216
  languages=["eng-Latn"],
214
217
  open_weights=True,
215
218
  revision="02460ae401a22b09d2c6652e23371398329551e2",
@@ -234,6 +237,7 @@ m2v_base_output = ModelMeta(
234
237
  m2v_multilingual_output = ModelMeta(
235
238
  loader=Model2VecModel,
236
239
  name="minishlab/M2V_multilingual_output",
240
+ model_type=["dense"],
237
241
  languages=["eng-Latn"],
238
242
  open_weights=True,
239
243
  revision="2cf4ec4e1f51aeca6c55cf9b93097d00711a6305",
@@ -258,6 +262,7 @@ m2v_multilingual_output = ModelMeta(
258
262
  potion_base_2m = ModelMeta(
259
263
  loader=Model2VecModel,
260
264
  name="minishlab/potion-base-2M",
265
+ model_type=["dense"],
261
266
  languages=["eng-Latn"],
262
267
  open_weights=True,
263
268
  revision="86db093558fbced2072b929eb1690bce5272bd4b",
@@ -282,6 +287,7 @@ potion_base_2m = ModelMeta(
282
287
  potion_base_4m = ModelMeta(
283
288
  loader=Model2VecModel,
284
289
  name="minishlab/potion-base-4M",
290
+ model_type=["dense"],
285
291
  languages=["eng-Latn"],
286
292
  open_weights=True,
287
293
  revision="81b1802ada41afcd0987a37dc15e569c9fa76f04",
@@ -306,6 +312,7 @@ potion_base_4m = ModelMeta(
306
312
  potion_base_8m = ModelMeta(
307
313
  loader=Model2VecModel,
308
314
  name="minishlab/potion-base-8M",
315
+ model_type=["dense"],
309
316
  languages=["eng-Latn"],
310
317
  open_weights=True,
311
318
  revision="dcbec7aa2d52fc76754ac6291803feedd8c619ce",
@@ -330,6 +337,7 @@ potion_base_8m = ModelMeta(
330
337
  potion_multilingual_128m = ModelMeta(
331
338
  loader=Model2VecModel,
332
339
  name="minishlab/potion-multilingual-128M",
340
+ model_type=["dense"],
333
341
  languages=_POTION_MULTILINGUAL_128M_LANGUAGES,
334
342
  open_weights=True,
335
343
  revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2a",
@@ -354,6 +362,7 @@ potion_multilingual_128m = ModelMeta(
354
362
  pubmed_bert_100k = ModelMeta(
355
363
  loader=Model2VecModel,
356
364
  name="NeuML/pubmedbert-base-embeddings-100K",
365
+ model_type=["dense"],
357
366
  languages=["eng-Latn"],
358
367
  open_weights=True,
359
368
  revision="bac5e3b12fb8c650e92a19c41b436732c4f16e9e",
@@ -377,6 +386,7 @@ pubmed_bert_100k = ModelMeta(
377
386
  pubmed_bert_500k = ModelMeta(
378
387
  loader=Model2VecModel,
379
388
  name="NeuML/pubmedbert-base-embeddings-500K",
389
+ model_type=["dense"],
380
390
  languages=["eng-Latn"],
381
391
  open_weights=True,
382
392
  revision="34ba71e35c393fdad7ed695113f653feb407b16b",
@@ -400,6 +410,7 @@ pubmed_bert_500k = ModelMeta(
400
410
  pubmed_bert_1m = ModelMeta(
401
411
  loader=Model2VecModel,
402
412
  name="NeuML/pubmedbert-base-embeddings-1M",
413
+ model_type=["dense"],
403
414
  languages=["eng-Latn"],
404
415
  open_weights=True,
405
416
  revision="2b7fed222594708da6d88bcda92ae9b434b7ddd1",
@@ -423,6 +434,7 @@ pubmed_bert_1m = ModelMeta(
423
434
  pubmed_bert_2m = ModelMeta(
424
435
  loader=Model2VecModel,
425
436
  name="NeuML/pubmedbert-base-embeddings-2M",
437
+ model_type=["dense"],
426
438
  languages=["eng-Latn"],
427
439
  open_weights=True,
428
440
  revision="1d7bbe04d6713e425161146bfdc71473cbed498a",
@@ -446,6 +458,7 @@ pubmed_bert_2m = ModelMeta(
446
458
  pubmed_bert_8m = ModelMeta(
447
459
  loader=Model2VecModel,
448
460
  name="NeuML/pubmedbert-base-embeddings-8M",
461
+ model_type=["dense"],
449
462
  languages=["eng-Latn"],
450
463
  open_weights=True,
451
464
  revision="387d350015e963744f4fafe56a574b7cd48646c9",
@@ -91,6 +91,7 @@ m3e_dataset = {
91
91
  m3e_base = ModelMeta(
92
92
  loader=sentence_transformers_loader,
93
93
  name="moka-ai/m3e-base",
94
+ model_type=["dense"],
94
95
  languages=["zho-Hans", "eng-Latn"],
95
96
  open_weights=True,
96
97
  revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
@@ -116,6 +117,7 @@ m3e_base = ModelMeta(
116
117
  m3e_small = ModelMeta(
117
118
  loader=sentence_transformers_loader,
118
119
  name="moka-ai/m3e-small",
120
+ model_type=["dense"],
119
121
  languages=["zho-Hans", "eng-Latn"],
120
122
  open_weights=True,
121
123
  revision="44c696631b2a8c200220aaaad5f987f096e986df",
@@ -141,6 +143,7 @@ m3e_small = ModelMeta(
141
143
  m3e_large = ModelMeta(
142
144
  loader=sentence_transformers_loader,
143
145
  name="moka-ai/m3e-large",
146
+ model_type=["dense"],
144
147
  languages=["zho-Hans", "eng-Latn"],
145
148
  open_weights=True,
146
149
  revision="12900375086c37ba5d83d1e417b21dc7d1d1f388",
@@ -21,6 +21,7 @@ mxbai_embed_large_v1 = ModelMeta(
21
21
  },
22
22
  ),
23
23
  name="mixedbread-ai/mxbai-embed-large-v1",
24
+ model_type=["dense"],
24
25
  languages=["eng-Latn"],
25
26
  open_weights=True,
26
27
  revision="990580e27d329c7408b3741ecff85876e128e203",
@@ -57,6 +58,7 @@ mxbai_embed_large_v1 = ModelMeta(
57
58
  mxbai_embed_2d_large_v1 = ModelMeta(
58
59
  loader=sentence_transformers_loader,
59
60
  name="mixedbread-ai/mxbai-embed-2d-large-v1",
61
+ model_type=["dense"],
60
62
  languages=["eng-Latn"],
61
63
  open_weights=True,
62
64
  revision="7e639ca8e344af398876ead3b19ec3c0b9068f49",
@@ -81,6 +83,7 @@ mxbai_embed_2d_large_v1 = ModelMeta(
81
83
  mxbai_embed_xsmall_v1 = ModelMeta(
82
84
  loader=sentence_transformers_loader,
83
85
  name="mixedbread-ai/mxbai-embed-xsmall-v1",
86
+ model_type=["dense"],
84
87
  languages=["eng-Latn"],
85
88
  open_weights=True,
86
89
  revision="2f741ec33328bb57e4704e1238fc59a4a5745705",
@@ -99,4 +102,10 @@ mxbai_embed_xsmall_v1 = ModelMeta(
99
102
  public_training_code=None,
100
103
  public_training_data=None,
101
104
  training_datasets=mixedbread_training_data,
105
+ citation="""@online{xsmall2024mxbai,
106
+ title={Every Byte Matters: Introducing mxbai-embed-xsmall-v1},
107
+ author={Sean Lee and Julius Lipp and Rui Huang and Darius Koenig},
108
+ year={2024},
109
+ url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
110
+ }""",
102
111
  )
@@ -0,0 +1,70 @@
1
+ from mteb.models.model_meta import ModelMeta, ScoringFunction
2
+ from mteb.models.sentence_transformer_wrapper import (
3
+ SentenceTransformerEncoderWrapper,
4
+ )
5
+
6
+ nb_sbert = ModelMeta(
7
+ loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type]
8
+ name="NbAiLab/nb-sbert-base",
9
+ model_type=["dense"],
10
+ languages=["nno-Latn", "nob-Latn", "swe-Latn", "dan-Latn"],
11
+ open_weights=True,
12
+ revision="b95656350a076aeafd2d23763660f80655408cc6",
13
+ release_date="2022-11-23",
14
+ n_parameters=1_780_000_000,
15
+ memory_usage_mb=678,
16
+ embed_dim=4096,
17
+ license="apache-2.0",
18
+ max_tokens=75,
19
+ reference="https://huggingface.co/NbAiLab/nb-sbert-base",
20
+ similarity_fn_name=ScoringFunction.COSINE,
21
+ framework=["Sentence Transformers", "PyTorch"],
22
+ use_instructions=False,
23
+ public_training_code=None,
24
+ public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian",
25
+ training_datasets=set(),
26
+ )
27
+
28
+ nb_bert_large = ModelMeta(
29
+ loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type]
30
+ name="NbAiLab/nb-bert-large",
31
+ model_type=["dense"],
32
+ languages=["nno-Latn", "nob-Latn"],
33
+ open_weights=True,
34
+ revision="f9d0fc184adab4dc354d85e1854b7634540d7550",
35
+ release_date="2021-04-29",
36
+ n_parameters=355087360,
37
+ memory_usage_mb=1359,
38
+ embed_dim=1024,
39
+ license="cc-by-4.0",
40
+ max_tokens=512,
41
+ reference="https://huggingface.co/NbAiLab/nb-bert-large",
42
+ similarity_fn_name=ScoringFunction.COSINE,
43
+ framework=["Sentence Transformers", "PyTorch"],
44
+ use_instructions=False,
45
+ public_training_code=None,
46
+ public_training_data="https://huggingface.co/NbAiLab/nb-bert-large#training-data",
47
+ training_datasets=set(),
48
+ )
49
+
50
+ nb_bert_base = ModelMeta(
51
+ loader=SentenceTransformerEncoderWrapper, # type: ignore[arg-type]
52
+ name="NbAiLab/nb-bert-base",
53
+ model_type=["dense"],
54
+ languages=["nno-Latn", "nob-Latn"],
55
+ open_weights=True,
56
+ revision="9417c3f62a3adc99f17ff92bff446f35d011f994",
57
+ release_date="2021-01-13",
58
+ n_parameters=177853440,
59
+ memory_usage_mb=681,
60
+ embed_dim=768,
61
+ license="cc-by-4.0",
62
+ max_tokens=512,
63
+ reference="https://huggingface.co/NbAiLab/nb-bert-base",
64
+ similarity_fn_name=ScoringFunction.COSINE,
65
+ framework=["Sentence Transformers", "PyTorch"],
66
+ use_instructions=False,
67
+ public_training_code=None,
68
+ public_training_data="https://huggingface.co/NbAiLab/nb-bert-base#training-data",
69
+ training_datasets=set(),
70
+ )
@@ -97,6 +97,7 @@ class NoInstructModel(AbsEncoder):
97
97
  no_instruct_small_v0 = ModelMeta(
98
98
  loader=NoInstructModel,
99
99
  name="avsolatorio/NoInstruct-small-Embedding-v0",
100
+ model_type=["dense"],
100
101
  languages=["eng-Latn"],
101
102
  open_weights=True,
102
103
  revision="b38747000553d8268915c95a55fc87e707c9aadd",