mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -5,108 +5,10 @@ from mteb.models.model_meta import (
5
5
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
6
6
  from mteb.types import PromptType
7
7
 
8
+ from .facebookai import XLMR_LANGUAGES
9
+
8
10
  E5_PAPER_RELEASE_DATE = "2024-02-08"
9
- XLMR_LANGUAGES = [
10
- "afr-Latn",
11
- "amh-Latn",
12
- "ara-Latn",
13
- "asm-Latn",
14
- "aze-Latn",
15
- "bel-Latn",
16
- "bul-Latn",
17
- "ben-Latn",
18
- "ben-Beng",
19
- "bre-Latn",
20
- "bos-Latn",
21
- "cat-Latn",
22
- "ces-Latn",
23
- "cym-Latn",
24
- "dan-Latn",
25
- "deu-Latn",
26
- "ell-Latn",
27
- "eng-Latn",
28
- "epo-Latn",
29
- "spa-Latn",
30
- "est-Latn",
31
- "eus-Latn",
32
- "fas-Latn",
33
- "fin-Latn",
34
- "fra-Latn",
35
- "fry-Latn",
36
- "gle-Latn",
37
- "gla-Latn",
38
- "glg-Latn",
39
- "guj-Latn",
40
- "hau-Latn",
41
- "heb-Latn",
42
- "hin-Latn",
43
- "hin-Deva",
44
- "hrv-Latn",
45
- "hun-Latn",
46
- "hye-Latn",
47
- "ind-Latn",
48
- "isl-Latn",
49
- "ita-Latn",
50
- "jpn-Latn",
51
- "jav-Latn",
52
- "kat-Latn",
53
- "kaz-Latn",
54
- "khm-Latn",
55
- "kan-Latn",
56
- "kor-Latn",
57
- "kur-Latn",
58
- "kir-Latn",
59
- "lat-Latn",
60
- "lao-Latn",
61
- "lit-Latn",
62
- "lav-Latn",
63
- "mlg-Latn",
64
- "mkd-Latn",
65
- "mal-Latn",
66
- "mon-Latn",
67
- "mar-Latn",
68
- "msa-Latn",
69
- "mya-Latn",
70
- "nep-Latn",
71
- "nld-Latn",
72
- "nob-Latn",
73
- "orm-Latn",
74
- "ori-Latn",
75
- "pan-Latn",
76
- "pol-Latn",
77
- "pus-Latn",
78
- "por-Latn",
79
- "ron-Latn",
80
- "rus-Latn",
81
- "san-Latn",
82
- "snd-Latn",
83
- "sin-Latn",
84
- "slk-Latn",
85
- "slv-Latn",
86
- "som-Latn",
87
- "sqi-Latn",
88
- "srp-Latn",
89
- "sun-Latn",
90
- "swe-Latn",
91
- "swa-Latn",
92
- "tam-Latn",
93
- "tam-Taml",
94
- "tel-Latn",
95
- "tel-Telu",
96
- "tha-Latn",
97
- "tgl-Latn",
98
- "tur-Latn",
99
- "uig-Latn",
100
- "ukr-Latn",
101
- "urd-Latn",
102
- "urd-Arab",
103
- "uzb-Latn",
104
- "vie-Latn",
105
- "xho-Latn",
106
- "yid-Latn",
107
- "zho-Hant",
108
- "zho-Hans",
109
- ]
11
+
110
12
 
111
13
  MULTILINGUAL_E5_CITATION = """
112
14
  @article{wang2024multilingual,
@@ -168,6 +70,7 @@ e5_mult_small = ModelMeta(
168
70
  model_prompts=model_prompts,
169
71
  ),
170
72
  name="intfloat/multilingual-e5-small",
73
+ model_type=["dense"],
171
74
  languages=XLMR_LANGUAGES,
172
75
  open_weights=True,
173
76
  revision="fd1525a9fd15316a2d503bf26ab031a61d056e98",
@@ -179,7 +82,7 @@ e5_mult_small = ModelMeta(
179
82
  max_tokens=512,
180
83
  reference="https://huggingface.co/intfloat/multilingual-e5-small",
181
84
  similarity_fn_name=ScoringFunction.COSINE,
182
- framework=["Sentence Transformers", "PyTorch"],
85
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
183
86
  use_instructions=True,
184
87
  public_training_code=None, # couldn't find
185
88
  public_training_data=None,
@@ -194,6 +97,7 @@ e5_mult_base = ModelMeta(
194
97
  model_prompts=model_prompts,
195
98
  ),
196
99
  name="intfloat/multilingual-e5-base",
100
+ model_type=["dense"],
197
101
  languages=XLMR_LANGUAGES,
198
102
  open_weights=True,
199
103
  revision="d13f1b27baf31030b7fd040960d60d909913633f",
@@ -205,7 +109,7 @@ e5_mult_base = ModelMeta(
205
109
  max_tokens=514,
206
110
  reference="https://huggingface.co/intfloat/multilingual-e5-base",
207
111
  similarity_fn_name=ScoringFunction.COSINE,
208
- framework=["Sentence Transformers", "PyTorch"],
112
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
209
113
  use_instructions=True,
210
114
  public_training_code=None,
211
115
  public_training_data=None,
@@ -220,6 +124,7 @@ e5_mult_large = ModelMeta(
220
124
  model_prompts=model_prompts,
221
125
  ),
222
126
  name="intfloat/multilingual-e5-large",
127
+ model_type=["dense"],
223
128
  languages=XLMR_LANGUAGES,
224
129
  open_weights=True,
225
130
  revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb",
@@ -231,7 +136,7 @@ e5_mult_large = ModelMeta(
231
136
  max_tokens=514,
232
137
  reference="https://huggingface.co/intfloat/multilingual-e5-large",
233
138
  similarity_fn_name=ScoringFunction.COSINE,
234
- framework=["Sentence Transformers", "PyTorch"],
139
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
235
140
  use_instructions=True,
236
141
  public_training_code=None,
237
142
  public_training_data=None,
@@ -246,6 +151,7 @@ e5_eng_small_v2 = ModelMeta(
246
151
  model_prompts=model_prompts,
247
152
  ),
248
153
  name="intfloat/e5-small-v2",
154
+ model_type=["dense"],
249
155
  languages=["eng-Latn"],
250
156
  open_weights=True,
251
157
  revision="dca8b1a9dae0d4575df2bf423a5edb485a431236",
@@ -257,7 +163,7 @@ e5_eng_small_v2 = ModelMeta(
257
163
  max_tokens=512,
258
164
  reference="https://huggingface.co/intfloat/e5-small-v2",
259
165
  similarity_fn_name=ScoringFunction.COSINE,
260
- framework=["Sentence Transformers", "PyTorch"],
166
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
261
167
  use_instructions=True,
262
168
  public_training_code=None,
263
169
  public_training_data=None,
@@ -272,6 +178,7 @@ e5_eng_small = ModelMeta(
272
178
  model_prompts=model_prompts,
273
179
  ),
274
180
  name="intfloat/e5-small",
181
+ model_type=["dense"],
275
182
  languages=["eng-Latn"],
276
183
  open_weights=True,
277
184
  revision="e272f3049e853b47cb5ca3952268c6662abda68f",
@@ -283,7 +190,7 @@ e5_eng_small = ModelMeta(
283
190
  max_tokens=512,
284
191
  reference="https://huggingface.co/intfloat/e5-small",
285
192
  similarity_fn_name=ScoringFunction.COSINE,
286
- framework=["Sentence Transformers", "PyTorch"],
193
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
287
194
  use_instructions=True,
288
195
  public_training_code=None,
289
196
  public_training_data=None,
@@ -298,6 +205,7 @@ e5_eng_base_v2 = ModelMeta(
298
205
  model_prompts=model_prompts,
299
206
  ),
300
207
  name="intfloat/e5-base-v2",
208
+ model_type=["dense"],
301
209
  languages=["eng-Latn"],
302
210
  open_weights=True,
303
211
  revision="1c644c92ad3ba1efdad3f1451a637716616a20e8",
@@ -309,7 +217,7 @@ e5_eng_base_v2 = ModelMeta(
309
217
  max_tokens=512,
310
218
  reference="https://huggingface.co/intfloat/e5-base-v2",
311
219
  similarity_fn_name=ScoringFunction.COSINE,
312
- framework=["Sentence Transformers", "PyTorch"],
220
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
313
221
  use_instructions=True,
314
222
  superseded_by=None,
315
223
  adapted_from="intfloat/e5-base",
@@ -325,6 +233,7 @@ e5_eng_large_v2 = ModelMeta(
325
233
  model_prompts=model_prompts,
326
234
  ),
327
235
  name="intfloat/e5-large-v2",
236
+ model_type=["dense"],
328
237
  languages=["eng-Latn"],
329
238
  open_weights=True,
330
239
  revision="b322e09026e4ea05f42beadf4d661fb4e101d311",
@@ -336,7 +245,7 @@ e5_eng_large_v2 = ModelMeta(
336
245
  max_tokens=514,
337
246
  reference="https://huggingface.co/intfloat/e5-large-v2",
338
247
  similarity_fn_name=ScoringFunction.COSINE,
339
- framework=["Sentence Transformers", "PyTorch"],
248
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
340
249
  use_instructions=True,
341
250
  superseded_by=None,
342
251
  adapted_from="intfloat/e5-large",
@@ -352,6 +261,7 @@ e5_large = ModelMeta(
352
261
  model_prompts=model_prompts,
353
262
  ),
354
263
  name="intfloat/e5-large",
264
+ model_type=["dense"],
355
265
  languages=["eng-Latn"],
356
266
  open_weights=True,
357
267
  revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81",
@@ -363,7 +273,7 @@ e5_large = ModelMeta(
363
273
  max_tokens=512,
364
274
  reference="https://huggingface.co/intfloat/e5-large",
365
275
  similarity_fn_name=ScoringFunction.COSINE,
366
- framework=["Sentence Transformers", "PyTorch"],
276
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
367
277
  use_instructions=True,
368
278
  superseded_by="intfloat/e5-large-v2",
369
279
  adapted_from="google-bert/bert-large-uncased-whole-word-masking",
@@ -379,6 +289,7 @@ e5_base = ModelMeta(
379
289
  model_prompts=model_prompts,
380
290
  ),
381
291
  name="intfloat/e5-base",
292
+ model_type=["dense"],
382
293
  languages=["eng-Latn"],
383
294
  open_weights=True,
384
295
  revision="b533fe4636f4a2507c08ddab40644d20b0006d6a",
@@ -390,7 +301,7 @@ e5_base = ModelMeta(
390
301
  max_tokens=512,
391
302
  reference="https://huggingface.co/intfloat/e5-base",
392
303
  similarity_fn_name=ScoringFunction.COSINE,
393
- framework=["Sentence Transformers", "PyTorch"],
304
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
394
305
  use_instructions=True,
395
306
  superseded_by="intfloat/e5-base-v2",
396
307
  adapted_from="google-bert/bert-base-uncased",
@@ -30,6 +30,7 @@ class E5VModel(AbsEncoder):
30
30
  self,
31
31
  model_name: str,
32
32
  revision: str,
33
+ device: str | None = None,
33
34
  composed_prompt=None,
34
35
  **kwargs: Any,
35
36
  ):
@@ -47,8 +48,7 @@ class E5VModel(AbsEncoder):
47
48
  self.processor = LlavaNextProcessor.from_pretrained(
48
49
  model_name, revision=revision
49
50
  )
50
- if "device" in kwargs:
51
- self.device = kwargs.pop("device")
51
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
52
52
  self.model = LlavaNextForConditionalGeneration.from_pretrained(
53
53
  model_name, revision=revision, **kwargs
54
54
  )
@@ -87,7 +87,7 @@ class E5VModel(AbsEncoder):
87
87
  ],
88
88
  return_tensors="pt",
89
89
  padding=True,
90
- ).to("cuda")
90
+ ).to(self.device)
91
91
  text_outputs = self.model(
92
92
  **text_inputs, output_hidden_states=True, return_dict=True
93
93
  ).hidden_states[-1][:, -1, :]
@@ -111,7 +111,7 @@ class E5VModel(AbsEncoder):
111
111
  batch["image"],
112
112
  return_tensors="pt",
113
113
  padding=True,
114
- ).to("cuda")
114
+ ).to(self.device)
115
115
  image_outputs = self.model(
116
116
  **img_inputs, output_hidden_states=True, return_dict=True
117
117
  ).hidden_states[-1][:, -1, :]
@@ -141,7 +141,7 @@ class E5VModel(AbsEncoder):
141
141
  ]
142
142
  inputs = self.processor(
143
143
  prompts, batch["image"], return_tensors="pt", padding=True
144
- ).to("cuda")
144
+ ).to(self.device)
145
145
  outputs = self.model(
146
146
  **inputs, output_hidden_states=True, return_dict=True
147
147
  ).hidden_states[-1][:, -1, :]
@@ -160,6 +160,7 @@ e5_v = ModelMeta(
160
160
  device_map="auto",
161
161
  ),
162
162
  name="royokong/e5-v",
163
+ model_type=["dense"],
163
164
  languages=["eng-Latn"],
164
165
  revision="0c1f22679417b3ae925d779442221c40cd1861ab",
165
166
  release_date="2024-07-17",
@@ -172,7 +173,7 @@ e5_v = ModelMeta(
172
173
  open_weights=True,
173
174
  public_training_code="https://github.com/kongds/E5-V",
174
175
  public_training_data="https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse",
175
- framework=["PyTorch"],
176
+ framework=["PyTorch", "Transformers", "safetensors"],
176
177
  reference="https://huggingface.co/royokong/e5-v",
177
178
  similarity_fn_name=ScoringFunction.COSINE,
178
179
  use_instructions=True,
@@ -0,0 +1,164 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+ from torch.utils.data import DataLoader
5
+ from tqdm.auto import tqdm
6
+
7
+ from mteb._requires_package import (
8
+ requires_image_dependencies,
9
+ requires_package,
10
+ )
11
+ from mteb.abstasks.task_metadata import TaskMetadata
12
+ from mteb.models.abs_encoder import AbsEncoder
13
+ from mteb.models.model_meta import ModelMeta, ScoringFunction
14
+ from mteb.types import Array, BatchedInput, PromptType
15
+
16
+
17
class EagerEmbedV1Wrapper(AbsEncoder):
    """Wrapper for EagerEmbed single-vector embedding models.

    Loads a ``Qwen3VLForConditionalGeneration`` checkpoint together with its
    ``AutoProcessor`` and turns batches of text and/or images into
    L2-normalised embeddings taken from the final position of the last
    hidden state.
    """

    def __init__(
        self,
        model_name: str,
        revision: str | None = None,
        device: str | None = None,
        image_size: int = 784,
        **kwargs,
    ):
        """Load the model and processor.

        Args:
            model_name: Hub repo id (or local path) of the checkpoint.
            revision: Optional revision to load. When given it is forwarded
                to both the model and the processor so that the weights and
                the preprocessing config come from the same commit.
            device: Target device. Defaults to ``"cuda"`` when available,
                otherwise ``"cpu"``.
            image_size: Height and width (in pixels) requested for every
                image via ``resized_height`` / ``resized_width``.
            **kwargs: Extra keyword arguments forwarded to the model's
                ``from_pretrained`` call.
        """
        requires_image_dependencies()
        requires_package(
            self, "qwen_vl_utils", model_name, "pip install mteb[eager_embed]"
        )
        from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.image_size = image_size

        # Only forward `revision` when the caller supplied one, so the
        # default behaviour of `from_pretrained` is left untouched otherwise.
        hub_kwargs = {} if revision is None else {"revision": revision}

        # Load model
        self.mdl = Qwen3VLForConditionalGeneration.from_pretrained(
            model_name, **hub_kwargs, **kwargs
        )
        self.mdl = self.mdl.to(self.device)
        self.mdl.eval()

        # Load processor
        self.processor = AutoProcessor.from_pretrained(model_name, **hub_kwargs)

    def get_embedding(self, last_hidden_state: torch.Tensor) -> torch.Tensor:
        """Extract embeddings from last token of last hidden state."""
        # NOTE(review): index -1 is the real last token only if the processor
        # pads on the left -- confirm the tokenizer's padding side.
        reps = last_hidden_state[:, -1]
        return reps

    def encode(
        self,
        inputs: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        """Encode inputs (text and/or images) into embeddings.

        Args:
            inputs: Batches that may carry a ``"text"`` list, an ``"image"``
                list, or both.
            task_metadata: Unused by this wrapper.
            hf_split: Unused by this wrapper.
            hf_subset: Unused by this wrapper.
            prompt_type: When equal to ``PromptType.query`` every text is
                prefixed with ``"Query: "``.
            **kwargs: Ignored.

        Returns:
            All batch embeddings concatenated along dim 0, as a float32 CPU
            tensor with L2-normalised rows.
        """
        from qwen_vl_utils import process_vision_info

        # The prefix depends only on `prompt_type`, so compute it once.
        query_prefix = "Query: " if prompt_type == PromptType.query else ""
        all_embeddings: list[torch.Tensor] = []

        with torch.no_grad():
            for batch in tqdm(inputs, desc="Encoding"):
                batch_texts = batch.get("text", [])
                batch_images = batch.get("image", [])

                # One single-turn "user" conversation per example.
                messages = []
                for i in range(max(len(batch_texts), len(batch_images))):
                    text_content = batch_texts[i] if batch_texts else ""
                    image_content = batch_images[i] if batch_images else None

                    content = [
                        {"type": "text", "text": f"{query_prefix}{text_content}"}
                    ]

                    if image_content is not None:
                        content.append(
                            {
                                "type": "image",
                                "image": image_content,
                                "resized_height": self.image_size,
                                "resized_width": self.image_size,
                            }
                        )

                    messages.append([{"role": "user", "content": content}])

                # Prepare inputs: render each conversation with the chat
                # template and append the end-of-text marker.
                texts = [
                    self.processor.apply_chat_template(
                        msg, tokenize=False, add_generation_prompt=False
                    )
                    + "<|endoftext|>"
                    for msg in messages
                ]

                # Vision preprocessing is only needed when images are present.
                image_inputs = None
                video_inputs = None
                if batch_images:
                    image_inputs, video_inputs = process_vision_info(messages)

                model_inputs = self.processor(
                    text=texts,
                    images=image_inputs,
                    videos=video_inputs,
                    padding="longest",
                    return_tensors="pt",
                ).to(self.device)

                # Get embeddings
                output = self.mdl(
                    **model_inputs, return_dict=True, output_hidden_states=True
                )
                embeddings = self.get_embedding(output.hidden_states[-1])
                # Move to CPU and upcast before normalising.
                embeddings = embeddings.cpu().to(torch.float32)
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)

                all_embeddings.append(embeddings)

        return torch.cat(all_embeddings, dim=0)
125
+
126
+
127
# BibTeX entry attached to the model metadata below.
EAGER_EMBED_V1_CITATION = """@article{EagerEmbed,
  title={Eager Embed V1: Multimodal Dense Embeddings for Retrieval},
  author={Juan Pablo Balarini},
  year={2025},
  publisher={Eagerworks},
  url={https://github.com/eagerworks/eager-embed},
}"""

# Names of the datasets recorded as training data for the model.
EAGER_EMBED_V1_TRAINING_DATASETS = {"colpali", "bge-ir", "pixmo-docs", "wiki-ss"}

# Registry entry for eagerworks/eager-embed-v1, loaded through
# EagerEmbedV1Wrapper with half-precision weights and 784px images.
Eager_Embed_V1 = ModelMeta(
    loader=EagerEmbedV1Wrapper,
    loader_kwargs=dict(
        dtype=torch.float16,
        image_size=784,
    ),
    name="eagerworks/eager-embed-v1",
    model_type=["dense"],
    languages=["fra-Latn", "spa-Latn", "eng-Latn", "deu-Latn"],
    revision="a6bec272729c5056e2c26618ce085205c82a3b3c",
    release_date="2025-11-20",
    modalities=["image", "text"],
    n_parameters=4_000_000_000,
    memory_usage_mb=16929,
    max_tokens=262144,
    embed_dim=2560,
    license="apache-2.0",
    open_weights=True,
    framework=["Tevatron", "safetensors"],
    reference="https://huggingface.co/eagerworks/eager-embed-v1",
    similarity_fn_name=ScoringFunction.COSINE,
    use_instructions=True,
    training_datasets=EAGER_EMBED_V1_TRAINING_DATASETS,
    citation=EAGER_EMBED_V1_CITATION,
    # Base model given as a Hub repo id rather than a URL, consistent with
    # the other registry entries (e.g. "Qwen/Qwen3-Embedding-8B").
    adapted_from="Qwen/Qwen3-VL-4B-Instruct",
    public_training_code="https://github.com/eagerworks/eager-embed",
    public_training_data="https://github.com/eagerworks/eager-embed/blob/main/dataset_config.yaml",
)
@@ -0,0 +1,91 @@
1
+ from mteb.models.model_meta import ModelMeta
2
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
3
+
4
# Registry entry for the Scandinavian fine-tune of EmbeddingGemma 300m.
embedding_gemma_300m_scandi = ModelMeta(
    # --- identity ---
    name="emillykkejensen/EmbeddingGemma-Scandi-300m",
    revision="9f3307b9f601db564a9190cb475324d128dcfe86",
    release_date="2025-10-17",
    reference="https://huggingface.co/emillykkejensen/EmbeddingGemma-Scandi-300m",
    adapted_from="google/embeddinggemma-300m",
    license="apache-2.0",
    open_weights=True,
    # --- loading ---
    loader=sentence_transformers_loader,
    framework=["Sentence Transformers", "PyTorch", "safetensors"],
    # --- model characteristics ---
    model_type=["dense"],
    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
    n_parameters=307_581_696,
    memory_usage_mb=578,
    embed_dim=768,
    max_tokens=2048,
    similarity_fn_name="cosine",  # type: ignore[arg-type]
    use_instructions=True,
    # --- training provenance ---
    public_training_code=None,
    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
    training_datasets=set(),
    citation="""@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}""",
)
35
+
36
+
37
# Registry entry for the Scandinavian fine-tune of Qwen3-Embedding 0.6B.
qwen_scandi = ModelMeta(
    # --- identity ---
    name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
    revision="cf1e7ba36ebd3d605549d8f02930a18e17b54513",
    release_date="2025-10-17",
    reference="https://huggingface.co/emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
    adapted_from="Qwen/Qwen3-Embedding-0.6B",
    license="apache-2.0",
    open_weights=True,
    # --- loading ---
    loader=sentence_transformers_loader,
    framework=["Sentence Transformers", "PyTorch", "safetensors"],
    # --- model characteristics ---
    model_type=["dense"],
    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
    n_parameters=595_776_512,
    memory_usage_mb=2_272,
    embed_dim=1_024,
    max_tokens=32_768,
    similarity_fn_name="cosine",  # type: ignore[arg-type]
    use_instructions=True,
    # --- training provenance ---
    public_training_code=None,
    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
    training_datasets=set(),
)
59
+
60
+
61
# Registry entry for the Scandinavian mmBERT-based embedding model.
mmbert_scandi = ModelMeta(
    loader=sentence_transformers_loader,
    name="emillykkejensen/mmBERTscandi-base-embedding",
    model_type=["dense"],
    languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
    open_weights=True,
    revision="82d74c7a5d8e1ddf31b132865df2d16b2b0294ee",
    release_date="2025-10-17",
    n_parameters=306939648,
    memory_usage_mb=1171,
    embed_dim=768,
    max_tokens=8192,
    license="apache-2.0",
    # Points at this model's own Hub page; it previously linked to the
    # Qwen3-Embedding-Scandi-0.6B repo (copy-paste from the sibling entry).
    reference="https://huggingface.co/emillykkejensen/mmBERTscandi-base-embedding",
    framework=["Sentence Transformers", "PyTorch", "safetensors"],
    use_instructions=True,
    public_training_code=None,
    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
    training_datasets=set(),
    similarity_fn_name="cosine",  # type: ignore[arg-type]
    adapted_from="jonasaise/scandmmBERT-base-scandinavian",
    citation="""@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}""",
)
@@ -12,6 +12,7 @@ english_code_retriever = ModelMeta(
12
12
  },
13
13
  ),
14
14
  name="fyaronskiy/english_code_retriever",
15
+ model_type=["dense"],
15
16
  languages=["eng-Latn"],
16
17
  open_weights=True,
17
18
  revision="be653fab7d27a7348a0c2c3d16b9f92a7f10cb0c",
@@ -23,7 +24,7 @@ english_code_retriever = ModelMeta(
23
24
  max_tokens=8192,
24
25
  reference="https://huggingface.co/fyaronskiy/english_code_retriever",
25
26
  similarity_fn_name="cosine",
26
- framework=["Sentence Transformers", "PyTorch"],
27
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
27
28
  use_instructions=True,
28
29
  public_training_code=None,
29
30
  public_training_data="https://huggingface.co/datasets/code-search-net/code_search_net",
@@ -0,0 +1,32 @@
1
+ from mteb.models.model_meta import ModelMeta
2
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
3
+
4
# Registry entry for the Euler legal-domain embedding model.
Euler_Legal_Embedding_V1 = ModelMeta(
    # --- identity ---
    name="Mira190/Euler-Legal-Embedding-V1",
    revision="df607ed9e25e569514a99c27cdaaab16e76b6dd4",
    release_date="2025-11-06",
    reference="https://huggingface.co/Mira190/Euler-Legal-Embedding-V1",
    adapted_from="Qwen/Qwen3-Embedding-8B",
    superseded_by=None,
    license="apache-2.0",
    open_weights=True,
    # --- loading ---
    loader=sentence_transformers_loader,
    framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
    # --- model characteristics ---
    model_type=["dense"],
    languages=["eng-Latn"],
    n_parameters=8_000_000_000,
    memory_usage_mb=15_618,
    max_tokens=1_536,
    embed_dim=4_096,
    similarity_fn_name="cosine",
    use_instructions=False,
    # --- training provenance ---
    public_training_code=None,
    public_training_data=None,
    training_datasets=set(),  # final-data-new-anonymized-grok4-filtered
    citation="""@misc{euler2025legal,
  title={Euler-Legal-Embedding: Advanced Legal Representation Learning},
  author={LawRank Team},
  year={2025},
  publisher={Hugging Face}
}""",
)
@@ -138,6 +138,7 @@ laion_2b = set(
138
138
  EVA02_CLIP_B_16 = ModelMeta(
139
139
  loader=evaclip_loader,
140
140
  name="QuanSun/EVA02-CLIP-B-16",
141
+ model_type=["dense"],
141
142
  languages=["eng-Latn"],
142
143
  revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
143
144
  release_date="2023-04-26",
@@ -161,6 +162,7 @@ EVA02_CLIP_B_16 = ModelMeta(
161
162
  EVA02_CLIP_L_14 = ModelMeta(
162
163
  loader=evaclip_loader,
163
164
  name="QuanSun/EVA02-CLIP-L-14",
165
+ model_type=["dense"],
164
166
  languages=["eng-Latn"],
165
167
  revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
166
168
  release_date="2023-04-26",
@@ -184,6 +186,7 @@ EVA02_CLIP_L_14 = ModelMeta(
184
186
  EVA02_CLIP_bigE_14 = ModelMeta(
185
187
  loader=evaclip_loader,
186
188
  name="QuanSun/EVA02-CLIP-bigE-14",
189
+ model_type=["dense"],
187
190
  languages=["eng-Latn"],
188
191
  revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
189
192
  release_date="2023-04-26",
@@ -208,6 +211,7 @@ EVA02_CLIP_bigE_14 = ModelMeta(
208
211
  EVA02_CLIP_bigE_14_plus = ModelMeta(
209
212
  loader=evaclip_loader,
210
213
  name="QuanSun/EVA02-CLIP-bigE-14-plus",
214
+ model_type=["dense"],
211
215
  languages=["eng-Latn"],
212
216
  revision="11afd202f2ae80869d6cef18b1ec775e79bd8d12",
213
217
  release_date="2023-04-26",