mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458) hide show
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -219,6 +219,7 @@ monobert_large = ModelMeta(
219
219
  fp_options="float16",
220
220
  ),
221
221
  name="castorini/monobert-large-msmarco",
222
+ model_type=["cross-encoder"],
222
223
  languages=["eng-Latn"],
223
224
  open_weights=True,
224
225
  revision="0a97706f3827389da43b83348d5d18c9d53876fa",
@@ -234,7 +235,6 @@ monobert_large = ModelMeta(
234
235
  use_instructions=None,
235
236
  training_datasets=None,
236
237
  framework=["Sentence Transformers", "PyTorch"],
237
- is_cross_encoder=True,
238
238
  )
239
239
 
240
240
  # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28
@@ -244,6 +244,7 @@ jina_reranker_multilingual = ModelMeta(
244
244
  fp_options="float16",
245
245
  ),
246
246
  name="jinaai/jina-reranker-v2-base-multilingual",
247
+ model_type=["cross-encoder"],
247
248
  languages=["eng-Latn"],
248
249
  open_weights=True,
249
250
  revision="126747772a932960028d9f4dc93bd5d9c4869be4",
@@ -259,7 +260,6 @@ jina_reranker_multilingual = ModelMeta(
259
260
  use_instructions=None,
260
261
  training_datasets=None,
261
262
  framework=["Sentence Transformers", "PyTorch"],
262
- is_cross_encoder=True,
263
263
  )
264
264
 
265
265
  bge_reranker_v2_m3 = ModelMeta(
@@ -268,6 +268,7 @@ bge_reranker_v2_m3 = ModelMeta(
268
268
  fp_options="float16",
269
269
  ),
270
270
  name="BAAI/bge-reranker-v2-m3",
271
+ model_type=["cross-encoder"],
271
272
  languages=[
272
273
  "eng-Latn",
273
274
  "ara-Arab",
@@ -316,7 +317,6 @@ bge_reranker_v2_m3 = ModelMeta(
316
317
  use_instructions=None,
317
318
  training_datasets=bge_m3_training_data,
318
319
  framework=["Sentence Transformers", "PyTorch"],
319
- is_cross_encoder=True,
320
320
  citation="""
321
321
  @misc{li2023making,
322
322
  title={Making Large Language Models A Better Foundation For Dense Retrieval},
@@ -315,6 +315,7 @@ monot5_small = ModelMeta(
315
315
  fp_options="float16",
316
316
  ),
317
317
  name="castorini/monot5-small-msmarco-10k",
318
+ model_type=["cross-encoder"],
318
319
  languages=["eng-Latn"],
319
320
  open_weights=True,
320
321
  revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e",
@@ -330,7 +331,6 @@ monot5_small = ModelMeta(
330
331
  use_instructions=None,
331
332
  training_datasets=None,
332
333
  framework=["PyTorch"],
333
- is_cross_encoder=True,
334
334
  citation="""@misc{rosa2022parameterleftbehinddistillation,
335
335
  title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
336
336
  author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
@@ -348,6 +348,7 @@ monot5_base = ModelMeta(
348
348
  fp_options="float16",
349
349
  ),
350
350
  name="castorini/monot5-base-msmarco-10k",
351
+ model_type=["cross-encoder"],
351
352
  languages=["eng-Latn"],
352
353
  open_weights=True,
353
354
  revision="f15657ab3d2a5dd0b9a30c8c0b6a0a73c9cb5884",
@@ -372,7 +373,6 @@ monot5_base = ModelMeta(
372
373
  use_instructions=None,
373
374
  training_datasets=None,
374
375
  framework=["PyTorch"],
375
- is_cross_encoder=True,
376
376
  )
377
377
 
378
378
  monot5_large = ModelMeta(
@@ -381,6 +381,7 @@ monot5_large = ModelMeta(
381
381
  fp_options="float16",
382
382
  ),
383
383
  name="castorini/monot5-large-msmarco-10k",
384
+ model_type=["cross-encoder"],
384
385
  languages=["eng-Latn"],
385
386
  open_weights=True,
386
387
  revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98",
@@ -396,7 +397,6 @@ monot5_large = ModelMeta(
396
397
  use_instructions=None,
397
398
  training_datasets=None,
398
399
  framework=["PyTorch"],
399
- is_cross_encoder=True,
400
400
  citation="""@misc{rosa2022parameterleftbehinddistillation,
401
401
  title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
402
402
  author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
@@ -414,6 +414,7 @@ monot5_3b = ModelMeta(
414
414
  fp_options="float16",
415
415
  ),
416
416
  name="castorini/monot5-3b-msmarco-10k",
417
+ model_type=["cross-encoder"],
417
418
  languages=["eng-Latn"],
418
419
  open_weights=True,
419
420
  revision="bc0c419a438c81f592f878ce32430a1823f5db6c",
@@ -429,7 +430,6 @@ monot5_3b = ModelMeta(
429
430
  use_instructions=None,
430
431
  training_datasets=None,
431
432
  framework=["PyTorch"],
432
- is_cross_encoder=True,
433
433
  citation="""@misc{rosa2022parameterleftbehinddistillation,
434
434
  title={No Parameter Left Behind: How Distillation and Model Size Affect Zero-Shot Retrieval},
435
435
  author={Guilherme Moraes Rosa and Luiz Bonifacio and Vitor Jeronymo and Hugo Abonizio and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
@@ -447,6 +447,7 @@ flant5_base = ModelMeta(
447
447
  fp_options="float16",
448
448
  ),
449
449
  name="google/flan-t5-base",
450
+ model_type=["cross-encoder"],
450
451
  languages=["eng-Latn"],
451
452
  open_weights=True,
452
453
  revision="7bcac572ce56db69c1ea7c8af255c5d7c9672fc2",
@@ -484,7 +485,6 @@ flant5_base = ModelMeta(
484
485
  similarity_fn_name=None,
485
486
  use_instructions=None,
486
487
  framework=["PyTorch"],
487
- is_cross_encoder=True,
488
488
  )
489
489
 
490
490
  flant5_large = ModelMeta(
@@ -493,6 +493,7 @@ flant5_large = ModelMeta(
493
493
  fp_options="float16",
494
494
  ),
495
495
  name="google/flan-t5-large",
496
+ model_type=["cross-encoder"],
496
497
  languages=["eng-Latn"],
497
498
  open_weights=True,
498
499
  revision="0613663d0d48ea86ba8cb3d7a44f0f65dc596a2a",
@@ -530,7 +531,6 @@ flant5_large = ModelMeta(
530
531
  similarity_fn_name=None,
531
532
  use_instructions=None,
532
533
  framework=["PyTorch"],
533
- is_cross_encoder=True,
534
534
  )
535
535
 
536
536
  flant5_xl = ModelMeta(
@@ -539,6 +539,7 @@ flant5_xl = ModelMeta(
539
539
  fp_options="float16",
540
540
  ),
541
541
  name="google/flan-t5-xl",
542
+ model_type=["cross-encoder"],
542
543
  languages=["eng-Latn"],
543
544
  open_weights=True,
544
545
  revision="7d6315df2c2fb742f0f5b556879d730926ca9001",
@@ -576,7 +577,6 @@ flant5_xl = ModelMeta(
576
577
  similarity_fn_name=None,
577
578
  use_instructions=None,
578
579
  framework=["PyTorch"],
579
- is_cross_encoder=True,
580
580
  )
581
581
 
582
582
  flant5_xxl = ModelMeta(
@@ -585,6 +585,7 @@ flant5_xxl = ModelMeta(
585
585
  fp_options="float16",
586
586
  ),
587
587
  name="google/flan-t5-xxl",
588
+ model_type=["cross-encoder"],
588
589
  languages=["eng-Latn"],
589
590
  open_weights=True,
590
591
  revision="ae7c9136adc7555eeccc78cdd960dfd60fb346ce",
@@ -622,7 +623,6 @@ flant5_xxl = ModelMeta(
622
623
  similarity_fn_name=None,
623
624
  use_instructions=None,
624
625
  framework=["PyTorch"],
625
- is_cross_encoder=True,
626
626
  )
627
627
 
628
628
 
@@ -632,6 +632,7 @@ llama2_7b = ModelMeta(
632
632
  fp_options="float16",
633
633
  ),
634
634
  name="meta-llama/Llama-2-7b-hf",
635
+ model_type=["cross-encoder"],
635
636
  languages=["eng-Latn"],
636
637
  open_weights=True,
637
638
  revision="01c7f73d771dfac7d292323805ebc428287df4f9",
@@ -656,7 +657,6 @@ llama2_7b = ModelMeta(
656
657
  primaryClass={cs.CL},
657
658
  url={https://arxiv.org/abs/2307.09288},
658
659
  }""",
659
- is_cross_encoder=True,
660
660
  )
661
661
 
662
662
  llama2_7b_chat = ModelMeta(
@@ -665,6 +665,7 @@ llama2_7b_chat = ModelMeta(
665
665
  fp_options="float16",
666
666
  ),
667
667
  name="meta-llama/Llama-2-7b-chat-hf",
668
+ model_type=["cross-encoder"],
668
669
  languages=["eng-Latn"],
669
670
  open_weights=True,
670
671
  revision="f5db02db724555f92da89c216ac04704f23d4590",
@@ -689,7 +690,6 @@ llama2_7b_chat = ModelMeta(
689
690
  use_instructions=None,
690
691
  training_datasets=None,
691
692
  framework=["PyTorch"],
692
- is_cross_encoder=True,
693
693
  )
694
694
 
695
695
  mistral_7b = ModelMeta(
@@ -698,6 +698,7 @@ mistral_7b = ModelMeta(
698
698
  fp_options="float16",
699
699
  ),
700
700
  name="mistralai/Mistral-7B-Instruct-v0.2",
701
+ model_type=["cross-encoder"],
701
702
  languages=["eng-Latn"],
702
703
  open_weights=True,
703
704
  revision="3ad372fc79158a2148299e3318516c786aeded6c",
@@ -722,7 +723,6 @@ mistral_7b = ModelMeta(
722
723
  primaryClass={cs.CL},
723
724
  url={https://arxiv.org/abs/2310.06825},
724
725
  }""",
725
- is_cross_encoder=True,
726
726
  )
727
727
 
728
728
  followir_7b = ModelMeta(
@@ -731,6 +731,7 @@ followir_7b = ModelMeta(
731
731
  fp_options="float16",
732
732
  ),
733
733
  name="jhu-clsp/FollowIR-7B",
734
+ model_type=["cross-encoder"],
734
735
  languages=["eng-Latn"],
735
736
  open_weights=True,
736
737
  revision="4d25d437e38b510c01852070c0731e8f6e1875d1",
@@ -758,7 +759,6 @@ followir_7b = ModelMeta(
758
759
  primaryClass={cs.IR}
759
760
  }
760
761
  """,
761
- is_cross_encoder=True,
762
762
  )
763
763
 
764
764
 
@@ -874,6 +874,7 @@ mt5_base_mmarco_v2 = ModelMeta(
874
874
  fp_options="float16",
875
875
  ),
876
876
  name="unicamp-dl/mt5-base-mmarco-v2",
877
+ model_type=["cross-encoder"],
877
878
  languages=mt5_languages,
878
879
  open_weights=True,
879
880
  revision="cc0a949b9f21efcaba45c8cabb998ad02ce8d4e7",
@@ -898,7 +899,6 @@ mt5_base_mmarco_v2 = ModelMeta(
898
899
  similarity_fn_name=None,
899
900
  use_instructions=None,
900
901
  framework=["PyTorch"],
901
- is_cross_encoder=True,
902
902
  )
903
903
 
904
904
  mt5_13b_mmarco_100k = ModelMeta(
@@ -907,6 +907,7 @@ mt5_13b_mmarco_100k = ModelMeta(
907
907
  fp_options="float16",
908
908
  ),
909
909
  name="unicamp-dl/mt5-13b-mmarco-100k",
910
+ model_type=["cross-encoder"],
910
911
  languages=mt5_languages,
911
912
  open_weights=True,
912
913
  revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc",
@@ -922,5 +923,4 @@ mt5_13b_mmarco_100k = ModelMeta(
922
923
  use_instructions=None,
923
924
  training_datasets=None,
924
925
  framework=["PyTorch"],
925
- is_cross_encoder=True,
926
926
  )
@@ -9,6 +9,7 @@ from .stella_models import stella_zh_datasets
9
9
  ritrieve_zh_v1 = ModelMeta(
10
10
  loader=SentenceTransformerEncoderWrapper,
11
11
  name="richinfoai/ritrieve_zh_v1",
12
+ model_type=["dense"],
12
13
  languages=["zho-Hans"],
13
14
  open_weights=True,
14
15
  revision="f8d5a707656c55705027678e311f9202c8ced12c",
@@ -43,6 +43,10 @@ GIGA_task_prompts = {
43
43
  "query": "Given a news title, retrieve relevant news article",
44
44
  "document": "",
45
45
  },
46
+ "RiaNewsRetrievalHardNegatives.v2": {
47
+ "query": "Given a news title, retrieve relevant news article",
48
+ "document": "",
49
+ },
46
50
  "MIRACLReranking": {
47
51
  "query": "Given a question, retrieve Wikipedia passages that answer the question",
48
52
  "document": "",
@@ -51,6 +55,10 @@ GIGA_task_prompts = {
51
55
  "query": "Given a question, retrieve Wikipedia passages that answer the question",
52
56
  "document": "",
53
57
  },
58
+ "MIRACLRetrievalHardNegatives.v2": {
59
+ "query": "Given a question, retrieve Wikipedia passages that answer the question",
60
+ "document": "",
61
+ },
54
62
  "ArguAna": {
55
63
  "query": "Given a search query, retrieve passages that answer the question",
56
64
  "document": "Given a search query, retrieve passages that answer the question",
@@ -230,6 +238,7 @@ GIGA_task_prompts = {
230
238
  rubert_tiny = ModelMeta(
231
239
  loader=sentence_transformers_loader,
232
240
  name="cointegrated/rubert-tiny",
241
+ model_type=["dense"],
233
242
  languages=["rus-Cyrl"],
234
243
  open_weights=True,
235
244
  revision="5441c5ea8026d4f6d7505ec004845409f1259fb1",
@@ -255,6 +264,7 @@ rubert_tiny = ModelMeta(
255
264
  rubert_tiny2 = ModelMeta(
256
265
  loader=sentence_transformers_loader,
257
266
  name="cointegrated/rubert-tiny2",
267
+ model_type=["dense"],
258
268
  languages=["rus-Cyrl"],
259
269
  open_weights=True,
260
270
  revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3",
@@ -281,6 +291,7 @@ rubert_tiny2 = ModelMeta(
281
291
  sbert_large_nlu_ru = ModelMeta(
282
292
  loader=sentence_transformers_loader,
283
293
  name="ai-forever/sbert_large_nlu_ru",
294
+ model_type=["dense"],
284
295
  languages=["rus-Cyrl"],
285
296
  open_weights=True,
286
297
  revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a",
@@ -306,6 +317,7 @@ sbert_large_nlu_ru = ModelMeta(
306
317
  sbert_large_mt_nlu_ru = ModelMeta(
307
318
  loader=sentence_transformers_loader,
308
319
  name="ai-forever/sbert_large_mt_nlu_ru",
320
+ model_type=["dense"],
309
321
  languages=["rus-Cyrl"],
310
322
  open_weights=True,
311
323
  revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0",
@@ -333,6 +345,7 @@ user_base_ru = ModelMeta(
333
345
  model_prompts={"query": "query: ", "document": "passage: "},
334
346
  ),
335
347
  name="deepvk/USER-base",
348
+ model_type=["dense"],
336
349
  languages=["rus-Cyrl"],
337
350
  open_weights=True,
338
351
  revision="436a489a2087d61aa670b3496a9915f84e46c861",
@@ -393,6 +406,7 @@ user_base_ru = ModelMeta(
393
406
  user_bge_m3 = ModelMeta(
394
407
  loader=sentence_transformers_loader,
395
408
  name="deepvk/USER-bge-m3",
409
+ model_type=["dense"],
396
410
  languages=["rus-Cyrl"],
397
411
  open_weights=True,
398
412
  revision="0cc6cfe48e260fb0474c753087a69369e88709ae",
@@ -431,11 +445,19 @@ user_bge_m3 = ModelMeta(
431
445
  },
432
446
  public_training_code=None,
433
447
  public_training_data=None,
448
+ citation="""@misc{deepvk2024user,
449
+ title={USER: Universal Sentence Encoder for Russian},
450
+ author={Malashenko, Boris and Zemerov, Anton and Spirin, Egor},
451
+ url={https://huggingface.co/datasets/deepvk/USER-base},
452
+ publisher={Hugging Face},
453
+ year={2024},
454
+ }""",
434
455
  )
435
456
 
436
457
  deberta_v1_ru = ModelMeta(
437
458
  loader=sentence_transformers_loader,
438
459
  name="deepvk/deberta-v1-base",
460
+ model_type=["dense"],
439
461
  languages=["rus-Cyrl"],
440
462
  open_weights=True,
441
463
  revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4",
@@ -466,6 +488,7 @@ deberta_v1_ru = ModelMeta(
466
488
  rubert_base_cased = ModelMeta(
467
489
  loader=sentence_transformers_loader,
468
490
  name="DeepPavlov/rubert-base-cased",
491
+ model_type=["dense"],
469
492
  languages=["rus-Cyrl"],
470
493
  open_weights=True,
471
494
  revision="4036cab694767a299f2b9e6492909664d9414229",
@@ -501,6 +524,7 @@ rubert_base_cased = ModelMeta(
501
524
  distilrubert_small_cased_conversational = ModelMeta(
502
525
  loader=sentence_transformers_loader,
503
526
  name="DeepPavlov/distilrubert-small-cased-conversational",
527
+ model_type=["dense"],
504
528
  languages=["rus-Cyrl"],
505
529
  open_weights=True,
506
530
  revision="e348066b4a7279b97138038299bddc6580a9169a",
@@ -535,6 +559,7 @@ distilrubert_small_cased_conversational = ModelMeta(
535
559
  rubert_base_cased_sentence = ModelMeta(
536
560
  loader=sentence_transformers_loader,
537
561
  name="DeepPavlov/rubert-base-cased-sentence",
562
+ model_type=["dense"],
538
563
  languages=["rus-Cyrl"],
539
564
  open_weights=True,
540
565
  revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8",
@@ -559,6 +584,7 @@ rubert_base_cased_sentence = ModelMeta(
559
584
  labse_en_ru = ModelMeta(
560
585
  loader=sentence_transformers_loader,
561
586
  name="cointegrated/LaBSE-en-ru",
587
+ model_type=["dense"],
562
588
  languages=["rus-Cyrl"],
563
589
  open_weights=True,
564
590
  revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e",
@@ -586,6 +612,7 @@ turbo_models_datasets = set(
586
612
  rubert_tiny_turbo = ModelMeta(
587
613
  loader=sentence_transformers_loader,
588
614
  name="sergeyzh/rubert-tiny-turbo",
615
+ model_type=["dense"],
589
616
  languages=["rus-Cyrl"],
590
617
  open_weights=True,
591
618
  revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054",
@@ -608,6 +635,7 @@ rubert_tiny_turbo = ModelMeta(
608
635
  rubert_mini_frida = ModelMeta(
609
636
  loader=sentence_transformers_loader,
610
637
  name="sergeyzh/rubert-mini-frida",
638
+ model_type=["dense"],
611
639
  languages=["rus-Cyrl"],
612
640
  open_weights=True,
613
641
  revision="19b279b78afd945b5ccae78f63e284909814adc2",
@@ -635,6 +663,7 @@ rubert_mini_frida = ModelMeta(
635
663
  labse_ru_turbo = ModelMeta(
636
664
  loader=sentence_transformers_loader,
637
665
  name="sergeyzh/LaBSE-ru-turbo",
666
+ model_type=["dense"],
638
667
  languages=["rus-Cyrl"],
639
668
  open_weights=True,
640
669
  revision="1940b046c6b5e125df11722b899130329d0a46da",
@@ -683,6 +712,7 @@ rosberta_ru_en = ModelMeta(
683
712
  model_prompts=rosberta_prompts,
684
713
  ),
685
714
  name="ai-forever/ru-en-RoSBERTa",
715
+ model_type=["dense"],
686
716
  languages=["rus-Cyrl"],
687
717
  open_weights=True,
688
718
  revision="89fb1651989adbb1cfcfdedafd7d102951ad0555",
@@ -755,6 +785,7 @@ frida_prompts = {
755
785
  "SensitiveTopicsClassification": "categorize_topic: ",
756
786
  "TERRa": "categorize_entailment: ",
757
787
  "RiaNewsRetrieval": "categorize: ",
788
+ "RiaNewsRetrievalHardNegatives.v2": "",
758
789
  }
759
790
 
760
791
  frida_training_datasets = {
@@ -847,6 +878,7 @@ frida = ModelMeta(
847
878
  model_prompts=frida_prompts,
848
879
  ),
849
880
  name="ai-forever/FRIDA",
881
+ model_type=["dense"],
850
882
  languages=["rus-Cyrl"],
851
883
  open_weights=True,
852
884
  revision="7292217af9a9e6dbf07048f76b434ad1e2aa8b76",
@@ -864,6 +896,7 @@ frida = ModelMeta(
864
896
  public_training_data=None,
865
897
  public_training_code=None,
866
898
  framework=["Sentence Transformers", "PyTorch"],
899
+ citation=None,
867
900
  )
868
901
 
869
902
  giga_embeddings = ModelMeta(
@@ -879,6 +912,7 @@ giga_embeddings = ModelMeta(
879
912
  },
880
913
  ),
881
914
  name="ai-sage/Giga-Embeddings-instruct",
915
+ model_type=["dense"],
882
916
  languages=["eng-Latn", "rus-Cyrl"],
883
917
  open_weights=True,
884
918
  revision="0ad5b29bfecd806cecc9d66b927d828a736594dc",
@@ -910,6 +944,7 @@ berta_training_datasets = (
910
944
  berta = ModelMeta(
911
945
  loader=sentence_transformers_loader,
912
946
  name="sergeyzh/BERTA",
947
+ model_type=["dense"],
913
948
  languages=["rus-Cyrl"],
914
949
  open_weights=True,
915
950
  revision="914c8c8aed14042ed890fc2c662d5e9e66b2faa7",
@@ -982,6 +1017,7 @@ user2_small = ModelMeta(
982
1017
  model_prompts=user2_prompts,
983
1018
  ),
984
1019
  name="deepvk/USER2-small",
1020
+ model_type=["dense"],
985
1021
  languages=["rus-Cyrl"],
986
1022
  open_weights=True,
987
1023
  revision="23f65b34cf7632032061f5cc66c14714e6d4cee4",
@@ -999,6 +1035,13 @@ user2_small = ModelMeta(
999
1035
  public_training_data=None,
1000
1036
  public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
1001
1037
  framework=["Sentence Transformers", "PyTorch"],
1038
+ citation="""@misc{deepvk2025user,
1039
+ title={USER2},
1040
+ author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
1041
+ url={https://huggingface.co/deepvk/USER2-small},
1042
+ publisher={Hugging Face},
1043
+ year={2025},
1044
+ }""",
1002
1045
  )
1003
1046
 
1004
1047
  user2_base = ModelMeta(
@@ -1007,6 +1050,7 @@ user2_base = ModelMeta(
1007
1050
  model_prompts=user2_prompts,
1008
1051
  ),
1009
1052
  name="deepvk/USER2-base",
1053
+ model_type=["dense"],
1010
1054
  languages=["rus-Cyrl"],
1011
1055
  open_weights=True,
1012
1056
  revision="0942cf96909b6d52e61f79a01e2d30c7be640b27",
@@ -1024,4 +1068,11 @@ user2_base = ModelMeta(
1024
1068
  public_training_data=None,
1025
1069
  public_training_code="https://github.com/BlessedTatonka/some_code/tree/2899f27d51efdf4217fc6453799ff197e9792f1e",
1026
1070
  framework=["Sentence Transformers", "PyTorch"],
1071
+ citation="""@misc{deepvk2025user,
1072
+ title={USER2},
1073
+ author={Malashenko, Boris and Spirin, Egor and Sokolov Andrey},
1074
+ url={https://huggingface.co/deepvk/USER2-base},
1075
+ publisher={Hugging Face},
1076
+ year={2025},
1077
+ }""",
1027
1078
  )