mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (486) hide show
  1. mteb/_create_dataloaders.py +63 -14
  2. mteb/_evaluators/any_sts_evaluator.py +12 -5
  3. mteb/_evaluators/clustering_evaluator.py +12 -4
  4. mteb/_evaluators/evaluator.py +11 -5
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
  6. mteb/_evaluators/pair_classification_evaluator.py +13 -5
  7. mteb/_evaluators/retrieval_evaluator.py +22 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +20 -11
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +10 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +48 -21
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +25 -9
  21. mteb/abstasks/clustering.py +23 -10
  22. mteb/abstasks/clustering_legacy.py +22 -8
  23. mteb/abstasks/image/image_text_pair_classification.py +23 -9
  24. mteb/abstasks/multilabel_classification.py +13 -5
  25. mteb/abstasks/pair_classification.py +27 -11
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +56 -30
  28. mteb/abstasks/retrieval_dataset_loaders.py +48 -37
  29. mteb/abstasks/sts.py +29 -13
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +23 -12
  32. mteb/abstasks/text/reranking.py +2 -2
  33. mteb/abstasks/text/summarization.py +19 -8
  34. mteb/abstasks/zeroshot_classification.py +23 -9
  35. mteb/benchmarks/_create_table.py +13 -7
  36. mteb/benchmarks/benchmark.py +11 -1
  37. mteb/benchmarks/benchmarks/__init__.py +2 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +41 -2
  39. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  40. mteb/cache.py +10 -5
  41. mteb/cli/_display_tasks.py +9 -3
  42. mteb/cli/build_cli.py +5 -2
  43. mteb/cli/generate_model_card.py +9 -2
  44. mteb/deprecated_evaluator.py +16 -12
  45. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  46. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  47. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  48. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  49. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  50. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  51. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  52. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  65. mteb/evaluate.py +33 -20
  66. mteb/filter_tasks.py +12 -7
  67. mteb/get_tasks.py +9 -4
  68. mteb/languages/language_scripts.py +8 -3
  69. mteb/leaderboard/app.py +11 -4
  70. mteb/leaderboard/table.py +7 -2
  71. mteb/load_results.py +9 -3
  72. mteb/models/abs_encoder.py +22 -12
  73. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  74. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  75. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  76. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  77. mteb/models/get_model_meta.py +32 -6
  78. mteb/models/instruct_wrapper.py +13 -5
  79. mteb/models/model_implementations/align_models.py +10 -4
  80. mteb/models/model_implementations/amazon_models.py +1 -0
  81. mteb/models/model_implementations/andersborges.py +2 -0
  82. mteb/models/model_implementations/ara_models.py +1 -0
  83. mteb/models/model_implementations/arctic_models.py +8 -0
  84. mteb/models/model_implementations/b1ade_models.py +1 -0
  85. mteb/models/model_implementations/bedrock_models.py +20 -6
  86. mteb/models/model_implementations/bge_models.py +40 -1
  87. mteb/models/model_implementations/bica_model.py +1 -0
  88. mteb/models/model_implementations/blip2_models.py +11 -4
  89. mteb/models/model_implementations/blip_models.py +17 -4
  90. mteb/models/model_implementations/bm25.py +24 -14
  91. mteb/models/model_implementations/bmretriever_models.py +10 -2
  92. mteb/models/model_implementations/cadet_models.py +1 -0
  93. mteb/models/model_implementations/cde_models.py +11 -5
  94. mteb/models/model_implementations/clip_models.py +12 -4
  95. mteb/models/model_implementations/clips_models.py +3 -0
  96. mteb/models/model_implementations/codefuse_models.py +5 -0
  97. mteb/models/model_implementations/codesage_models.py +3 -0
  98. mteb/models/model_implementations/cohere_models.py +14 -4
  99. mteb/models/model_implementations/cohere_v.py +14 -4
  100. mteb/models/model_implementations/colpali_models.py +7 -3
  101. mteb/models/model_implementations/colqwen_models.py +17 -31
  102. mteb/models/model_implementations/colsmol_models.py +3 -1
  103. mteb/models/model_implementations/conan_models.py +11 -4
  104. mteb/models/model_implementations/dino_models.py +28 -4
  105. mteb/models/model_implementations/e5_instruct.py +4 -0
  106. mteb/models/model_implementations/e5_models.py +9 -0
  107. mteb/models/model_implementations/e5_v.py +10 -4
  108. mteb/models/model_implementations/eagerworks_models.py +11 -4
  109. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  110. mteb/models/model_implementations/en_code_retriever.py +1 -0
  111. mteb/models/model_implementations/euler_models.py +1 -0
  112. mteb/models/model_implementations/evaclip_models.py +13 -4
  113. mteb/models/model_implementations/fa_models.py +9 -0
  114. mteb/models/model_implementations/facebookai.py +2 -0
  115. mteb/models/model_implementations/geogpt_models.py +1 -0
  116. mteb/models/model_implementations/gme_v_models.py +7 -3
  117. mteb/models/model_implementations/google_models.py +15 -4
  118. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
  119. mteb/models/model_implementations/gritlm_models.py +3 -0
  120. mteb/models/model_implementations/gte_models.py +9 -0
  121. mteb/models/model_implementations/hinvec_models.py +6 -1
  122. mteb/models/model_implementations/human.py +1 -0
  123. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  124. mteb/models/model_implementations/inf_models.py +2 -0
  125. mteb/models/model_implementations/jasper_models.py +14 -5
  126. mteb/models/model_implementations/jina_clip.py +10 -4
  127. mteb/models/model_implementations/jina_models.py +17 -5
  128. mteb/models/model_implementations/kalm_models.py +24 -12
  129. mteb/models/model_implementations/kblab.py +1 -0
  130. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  131. mteb/models/model_implementations/kfst.py +1 -0
  132. mteb/models/model_implementations/kowshik24_models.py +1 -0
  133. mteb/models/model_implementations/lens_models.py +2 -0
  134. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  135. mteb/models/model_implementations/linq_models.py +7 -1
  136. mteb/models/model_implementations/listconranker.py +10 -4
  137. mteb/models/model_implementations/llm2clip_models.py +12 -4
  138. mteb/models/model_implementations/llm2vec_models.py +20 -6
  139. mteb/models/model_implementations/mcinext_models.py +8 -2
  140. mteb/models/model_implementations/mdbr_models.py +2 -0
  141. mteb/models/model_implementations/misc_models.py +63 -0
  142. mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
  143. mteb/models/model_implementations/mme5_models.py +2 -1
  144. mteb/models/model_implementations/moco_models.py +11 -4
  145. mteb/models/model_implementations/mod_models.py +2 -1
  146. mteb/models/model_implementations/model2vec_models.py +23 -4
  147. mteb/models/model_implementations/moka_models.py +3 -0
  148. mteb/models/model_implementations/nbailab.py +3 -0
  149. mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
  150. mteb/models/model_implementations/nomic_models.py +17 -4
  151. mteb/models/model_implementations/nomic_models_vision.py +5 -3
  152. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
  153. mteb/models/model_implementations/nvidia_models.py +15 -4
  154. mteb/models/model_implementations/octen_models.py +3 -1
  155. mteb/models/model_implementations/openai_models.py +14 -4
  156. mteb/models/model_implementations/openclip_models.py +17 -4
  157. mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
  158. mteb/models/model_implementations/ops_moa_models.py +9 -2
  159. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
  160. mteb/models/model_implementations/pawan_models.py +1 -0
  161. mteb/models/model_implementations/piccolo_models.py +2 -0
  162. mteb/models/model_implementations/promptriever_models.py +16 -6
  163. mteb/models/model_implementations/pylate_models.py +32 -13
  164. mteb/models/model_implementations/qodo_models.py +2 -0
  165. mteb/models/model_implementations/qtack_models.py +1 -0
  166. mteb/models/model_implementations/qwen3_models.py +11 -1
  167. mteb/models/model_implementations/qzhou_models.py +2 -0
  168. mteb/models/model_implementations/random_baseline.py +4 -3
  169. mteb/models/model_implementations/rasgaard_models.py +1 -0
  170. mteb/models/model_implementations/reasonir_model.py +65 -0
  171. mteb/models/model_implementations/repllama_models.py +15 -6
  172. mteb/models/model_implementations/rerankers_custom.py +13 -4
  173. mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
  174. mteb/models/model_implementations/richinfoai_models.py +1 -0
  175. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  176. mteb/models/model_implementations/ruri_models.py +10 -0
  177. mteb/models/model_implementations/salesforce_models.py +10 -1
  178. mteb/models/model_implementations/samilpwc_models.py +1 -0
  179. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  180. mteb/models/model_implementations/searchmap_models.py +1 -0
  181. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  182. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
  183. mteb/models/model_implementations/seed_models.py +2 -1
  184. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  185. mteb/models/model_implementations/shuu_model.py +1 -0
  186. mteb/models/model_implementations/siglip_models.py +19 -4
  187. mteb/models/model_implementations/slm_models.py +7 -4
  188. mteb/models/model_implementations/sonar_models.py +2 -1
  189. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  190. mteb/models/model_implementations/stella_models.py +6 -0
  191. mteb/models/model_implementations/tarka_models.py +2 -0
  192. mteb/models/model_implementations/text2vec_models.py +3 -0
  193. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  194. mteb/models/model_implementations/uae_models.py +10 -4
  195. mteb/models/model_implementations/vdr_models.py +8 -1
  196. mteb/models/model_implementations/vi_vn_models.py +6 -0
  197. mteb/models/model_implementations/vista_models.py +11 -4
  198. mteb/models/model_implementations/vlm2vec_models.py +11 -4
  199. mteb/models/model_implementations/voyage_models.py +52 -4
  200. mteb/models/model_implementations/voyage_v.py +11 -6
  201. mteb/models/model_implementations/xyz_models.py +1 -0
  202. mteb/models/model_implementations/youtu_models.py +1 -0
  203. mteb/models/model_implementations/yuan_models.py +1 -0
  204. mteb/models/model_implementations/yuan_models_en.py +2 -1
  205. mteb/models/model_meta.py +47 -9
  206. mteb/models/models_protocols.py +23 -18
  207. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  208. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  209. mteb/models/search_wrappers.py +31 -12
  210. mteb/models/sentence_transformer_wrapper.py +4 -3
  211. mteb/models/vllm_wrapper.py +8 -6
  212. mteb/results/benchmark_results.py +22 -17
  213. mteb/results/model_result.py +21 -15
  214. mteb/results/task_result.py +32 -16
  215. mteb/similarity_functions.py +8 -2
  216. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  217. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  218. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  219. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  220. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  221. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  222. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  223. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  224. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  225. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  226. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  227. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  228. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  229. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  230. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  231. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  232. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  233. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  234. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  235. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  236. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  237. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  238. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  239. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  240. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  241. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  242. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  243. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  244. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  245. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  246. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  247. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  248. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  249. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  250. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  251. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  252. mteb/tasks/classification/est/estonian_valence.py +1 -1
  253. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  254. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  257. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  260. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  261. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  262. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  263. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  264. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  265. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  266. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  267. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  268. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  269. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  270. mteb/tasks/classification/kor/klue_tc.py +2 -2
  271. mteb/tasks/classification/kor/kor_fin.py +1 -1
  272. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  274. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  275. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  276. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  277. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  278. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  279. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  280. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  281. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  282. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  283. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  284. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  285. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  286. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  287. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  288. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  289. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  290. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  291. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  292. mteb/tasks/classification/ron/moroco.py +1 -1
  293. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  294. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  295. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  296. mteb/tasks/classification/rus/headline_classification.py +2 -2
  297. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  298. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  299. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  300. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  301. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  302. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  303. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  304. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  305. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  306. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  307. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  308. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  309. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  310. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  311. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  312. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  313. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  314. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  315. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  316. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  317. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  318. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  319. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  320. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  321. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  322. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  323. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  324. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  325. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  326. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  327. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  328. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  329. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  330. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  331. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  332. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  333. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  334. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  335. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  336. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  337. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  338. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  341. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  342. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  343. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  344. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  345. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  346. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  347. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  348. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  349. mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
  350. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  351. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  352. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  353. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  354. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  355. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  356. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  357. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  358. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  359. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  360. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  361. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  362. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  363. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  364. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  365. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  366. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  367. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  368. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  369. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  370. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  371. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  372. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  373. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  374. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  375. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  376. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  377. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  378. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  379. mteb/tasks/pair_classification/rus/terra.py +2 -2
  380. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  381. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  382. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  383. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  384. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  385. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  386. mteb/tasks/retrieval/code/code_rag.py +4 -4
  387. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  388. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  389. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  390. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  391. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  392. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  393. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  394. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  395. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
  396. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
  397. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  398. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  399. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  400. mteb/tasks/retrieval/eng/__init__.py +42 -0
  401. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  402. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  403. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  404. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  405. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  406. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  407. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  408. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  409. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  410. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  411. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  412. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  413. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  414. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  415. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  416. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  417. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  418. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  419. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  420. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  421. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  422. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  423. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  424. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  425. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  426. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  428. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  435. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  438. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  439. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  440. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  441. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  442. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  443. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  444. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  445. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  446. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  447. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  448. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  449. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  450. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  451. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  452. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  453. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  454. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  455. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  456. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  457. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  458. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  459. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  460. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  461. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  462. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  463. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  464. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  465. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  466. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  467. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  468. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  469. mteb/tasks/retrieval/nob/norquad.py +1 -1
  470. mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
  471. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  472. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  473. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  474. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  475. mteb/tasks/sts/kor/klue_sts.py +1 -1
  476. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  477. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  478. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  479. mteb/types/_encoder_io.py +1 -1
  480. mteb/types/statistics.py +9 -2
  481. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
  482. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
  483. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  484. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  485. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  486. {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,13 @@
1
- import numpy as np
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
2
4
 
3
5
  from mteb.models.abs_encoder import AbsEncoder
4
6
  from mteb.models.model_meta import ModelMeta
5
7
 
8
+ if TYPE_CHECKING:
9
+ from mteb.types import Array
10
+
6
11
 
7
12
  class OPSWrapper(AbsEncoder):
8
13
  def __init__(self, model_name: str, revision: str):
@@ -15,7 +20,7 @@ class OPSWrapper(AbsEncoder):
15
20
  )
16
21
  self.output_dim = 1536
17
22
 
18
- def encode(self, sentences: list[str], **kwargs) -> np.ndarray:
23
+ def encode(self, sentences: list[str], **kwargs) -> Array:
19
24
  embeddings = self.model.encode(sentences, **kwargs)
20
25
  return embeddings[:, : self.output_dim]
21
26
 
@@ -28,6 +33,7 @@ ops_moa_conan_embedding = ModelMeta(
28
33
  languages=["zho-Hans"],
29
34
  loader=OPSWrapper,
30
35
  n_parameters=int(343 * 1e6),
36
+ n_embedding_parameters=21_635_072,
31
37
  memory_usage_mb=1308,
32
38
  max_tokens=512,
33
39
  embed_dim=1536,
@@ -60,6 +66,7 @@ ops_moa_yuan_embedding = ModelMeta(
60
66
  languages=["zho-Hans"],
61
67
  loader=OPSWrapper,
62
68
  n_parameters=int(343 * 1e6),
69
+ n_embedding_parameters=21_635_072,
63
70
  memory_usage_mb=1242,
64
71
  max_tokens=512,
65
72
  embed_dim=1536,
@@ -4,6 +4,7 @@ solon_embeddings_1_1 = ModelMeta(
4
4
  name="OrdalieTech/Solon-embeddings-mini-beta-1.1",
5
5
  languages=["fra-Latn"],
6
6
  n_parameters=210_000_000,
7
+ n_embedding_parameters=None,
7
8
  public_training_code=None,
8
9
  memory_usage_mb=808.0,
9
10
  open_weights=True,
@@ -20,6 +20,7 @@ pawan_embd_68m = ModelMeta(
20
20
  revision="32f295145802bdbd65699ad65fd27d2a5b69a909",
21
21
  release_date="2025-12-08",
22
22
  n_parameters=68_000_000,
23
+ n_embedding_parameters=None,
23
24
  memory_usage_mb=260,
24
25
  embed_dim=768,
25
26
  license="apache-2.0",
@@ -12,6 +12,7 @@ piccolo_base_zh = ModelMeta(
12
12
  revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
13
13
  release_date="2023-09-04", # first commit
14
14
  n_parameters=None,
15
+ n_embedding_parameters=16_226_304,
15
16
  memory_usage_mb=None, # can't see on model card
16
17
  embed_dim=768,
17
18
  license="mit",
@@ -37,6 +38,7 @@ piccolo_large_zh_v2 = ModelMeta(
37
38
  revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
38
39
  release_date="2024-04-22", # first commit
39
40
  n_parameters=None,
41
+ n_embedding_parameters=None,
40
42
  memory_usage_mb=None, # we don't know because they removed the model
41
43
  embed_dim=1024,
42
44
  license="not specified",
@@ -1,15 +1,21 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from collections.abc import Callable
3
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
4
5
 
5
6
  import torch
6
- from torch.utils.data import DataLoader
7
7
 
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
8
  from mteb.models.abs_encoder import AbsEncoder
10
9
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
- from mteb.models.models_protocols import EncoderProtocol
12
- from mteb.types import Array, BatchedInput, PromptType
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Callable
13
+
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.models.models_protocols import EncoderProtocol
18
+ from mteb.types import Array, BatchedInput, PromptType
13
19
 
14
20
  from .repllama_models import RepLLaMAModel, model_prompts
15
21
 
@@ -81,6 +87,7 @@ promptriever_llama2 = ModelMeta(
81
87
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision
82
88
  release_date="2024-09-15",
83
89
  n_parameters=7_000_000_000,
90
+ n_embedding_parameters=None,
84
91
  memory_usage_mb=26703,
85
92
  max_tokens=4096,
86
93
  embed_dim=4096,
@@ -117,6 +124,7 @@ promptriever_llama3 = ModelMeta(
117
124
  },
118
125
  release_date="2024-09-15",
119
126
  n_parameters=8_000_000_000,
127
+ n_embedding_parameters=None,
120
128
  memory_usage_mb=30518,
121
129
  max_tokens=8192,
122
130
  embed_dim=4096,
@@ -146,6 +154,7 @@ promptriever_llama3_instruct = ModelMeta(
146
154
  revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision
147
155
  release_date="2024-09-15",
148
156
  n_parameters=8_000_000_000,
157
+ n_embedding_parameters=None,
149
158
  memory_usage_mb=30518,
150
159
  max_tokens=8192,
151
160
  embed_dim=4096,
@@ -179,6 +188,7 @@ promptriever_mistral_v1 = ModelMeta(
179
188
  revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision
180
189
  release_date="2024-09-15",
181
190
  n_parameters=7_000_000_000,
191
+ n_embedding_parameters=131_072_000,
182
192
  memory_usage_mb=26703,
183
193
  training_datasets={
184
194
  # "samaya-ai/msmarco-w-instructions",
@@ -1,30 +1,36 @@
1
+ from __future__ import annotations
2
+
1
3
  import heapq
2
4
  import logging
3
5
  import shutil
4
6
  import tempfile
5
7
  from pathlib import Path
6
- from typing import Any
8
+ from typing import TYPE_CHECKING, Any
7
9
 
8
10
  import torch
9
- from torch.utils.data import DataLoader
10
11
 
11
12
  from mteb._create_dataloaders import (
12
13
  create_dataloader,
13
14
  )
14
15
  from mteb._requires_package import requires_package
15
- from mteb.abstasks.task_metadata import TaskMetadata
16
16
  from mteb.models.abs_encoder import AbsEncoder
17
17
  from mteb.models.model_meta import ModelMeta, ScoringFunction
18
- from mteb.types import (
19
- Array,
20
- BatchedInput,
21
- CorpusDatasetType,
22
- EncodeKwargs,
23
- PromptType,
24
- QueryDatasetType,
25
- RetrievalOutputType,
26
- TopRankedDocumentsType,
27
- )
18
+ from mteb.types import PromptType
19
+
20
+ if TYPE_CHECKING:
21
+ from torch.utils.data import DataLoader
22
+
23
+ from mteb.abstasks.task_metadata import TaskMetadata
24
+ from mteb.types import (
25
+ Array,
26
+ BatchedInput,
27
+ CorpusDatasetType,
28
+ EncodeKwargs,
29
+ QueryDatasetType,
30
+ RetrievalOutputType,
31
+ TopRankedDocumentsType,
32
+ )
33
+
28
34
 
29
35
  logger = logging.getLogger(__name__)
30
36
 
@@ -47,6 +53,7 @@ class PylateSearchEncoder:
47
53
  hf_split: str,
48
54
  hf_subset: str,
49
55
  encode_kwargs: EncodeKwargs,
56
+ num_proc: int,
50
57
  ) -> None:
51
58
  """Index the corpus for retrieval.
52
59
 
@@ -56,6 +63,7 @@ class PylateSearchEncoder:
56
63
  hf_split: Split of current task, allows to know some additional information about current split.
57
64
  hf_subset: Subset of current task. Similar to `hf_split` to get more information
58
65
  encode_kwargs: Additional arguments to pass to the encoder during indexing.
66
+ num_proc: Number of processes to use for indexing.
59
67
  """
60
68
  self.task_corpus = corpus
61
69
 
@@ -81,12 +89,14 @@ class PylateSearchEncoder:
81
89
  top_k: int,
82
90
  encode_kwargs: EncodeKwargs,
83
91
  top_ranked: TopRankedDocumentsType | None = None,
92
+ num_proc: int,
84
93
  ) -> RetrievalOutputType:
85
94
  queries_dataloader = create_dataloader(
86
95
  queries,
87
96
  task_metadata,
88
97
  prompt_type=PromptType.query,
89
98
  batch_size=encode_kwargs.get("batch_size", 32),
99
+ num_proc=num_proc,
90
100
  )
91
101
 
92
102
  query_embeddings = self.encode(
@@ -110,6 +120,7 @@ class PylateSearchEncoder:
110
120
  hf_subset=hf_subset,
111
121
  hf_split=hf_split,
112
122
  encode_kwargs=encode_kwargs,
123
+ num_proc=num_proc,
113
124
  )
114
125
  else:
115
126
  result_heaps = self._pylate_full_corpus_search(
@@ -120,6 +131,7 @@ class PylateSearchEncoder:
120
131
  hf_subset=hf_subset,
121
132
  hf_split=hf_split,
122
133
  encode_kwargs=encode_kwargs,
134
+ num_proc=num_proc,
123
135
  )
124
136
 
125
137
  results = {qid: {} for qid in query_idx_to_id.values()}
@@ -138,6 +150,7 @@ class PylateSearchEncoder:
138
150
  hf_split: str,
139
151
  top_k: int,
140
152
  encode_kwargs: EncodeKwargs,
153
+ num_proc: int,
141
154
  ) -> dict[str, list[tuple[float, str]]]:
142
155
  from pylate import indexes, retrieve
143
156
 
@@ -164,6 +177,7 @@ class PylateSearchEncoder:
164
177
  task_metadata,
165
178
  prompt_type=PromptType.document,
166
179
  batch_size=encode_kwargs.get("batch_size", 32),
180
+ num_proc=num_proc,
167
181
  )
168
182
  documents_embeddings = self.encode(
169
183
  documents_loader,
@@ -202,6 +216,7 @@ class PylateSearchEncoder:
202
216
  hf_subset: str,
203
217
  hf_split: str,
204
218
  encode_kwargs: EncodeKwargs,
219
+ num_proc: int = 1,
205
220
  ) -> dict[str, list[tuple[float, str]]]:
206
221
  """Rerank with PyLate's rank.rerank using per-query candidates.
207
222
 
@@ -224,6 +239,7 @@ class PylateSearchEncoder:
224
239
  task_metadata,
225
240
  prompt_type=PromptType.document,
226
241
  batch_size=encode_kwargs.get("batch_size", 32),
242
+ num_proc=num_proc,
227
243
  ),
228
244
  task_metadata=task_metadata,
229
245
  hf_split=hf_split,
@@ -346,6 +362,7 @@ colbert_v2 = ModelMeta(
346
362
  public_training_data=None,
347
363
  release_date="2024-09-21",
348
364
  n_parameters=int(110 * 1e6),
365
+ n_embedding_parameters=23_440_896,
349
366
  memory_usage_mb=418,
350
367
  max_tokens=180,
351
368
  embed_dim=None,
@@ -402,6 +419,7 @@ jina_colbert_v2 = ModelMeta(
402
419
  public_training_data=None,
403
420
  release_date="2024-08-16",
404
421
  n_parameters=int(559 * 1e6),
422
+ n_embedding_parameters=None,
405
423
  memory_usage_mb=1067,
406
424
  max_tokens=8192,
407
425
  embed_dim=None,
@@ -458,6 +476,7 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
458
476
  public_training_data="https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma",
459
477
  release_date="2025-04-30",
460
478
  n_parameters=int(149 * 1e6),
479
+ n_embedding_parameters=None,
461
480
  memory_usage_mb=None,
462
481
  max_tokens=8192,
463
482
  embed_dim=None,
@@ -36,6 +36,7 @@ Qodo_Embed_1_1_5B = ModelMeta(
36
36
  revision="84bbef079b32e8823ec226d4e9e92902706b0eb6",
37
37
  release_date="2025-02-19",
38
38
  n_parameters=1_780_000_000,
39
+ n_embedding_parameters=232_928_256,
39
40
  memory_usage_mb=6776,
40
41
  embed_dim=1536,
41
42
  license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
@@ -59,6 +60,7 @@ Qodo_Embed_1_7B = ModelMeta(
59
60
  revision="f9edd9bf7f687c0e832424058e265120f603cd81",
60
61
  release_date="2025-02-24",
61
62
  n_parameters=7_613_000_000,
63
+ n_embedding_parameters=None,
62
64
  memory_usage_mb=29040,
63
65
  embed_dim=3584,
64
66
  license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
@@ -31,6 +31,7 @@ mini_gte = ModelMeta(
31
31
  revision="7fbe6f9b4cc42615e0747299f837ad7769025492",
32
32
  release_date="2025-01-28",
33
33
  n_parameters=int(66.3 * 1e6),
34
+ n_embedding_parameters=23_440_896,
34
35
  memory_usage_mb=253,
35
36
  embed_dim=768,
36
37
  license="apache-2.0",
@@ -1,6 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
1
5
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
6
  from mteb.models.model_meta import ModelMeta
3
- from mteb.models.models_protocols import EncoderProtocol, PromptType
7
+ from mteb.types import PromptType
8
+
9
+ if TYPE_CHECKING:
10
+ from mteb.models.models_protocols import EncoderProtocol
4
11
 
5
12
 
6
13
  def instruction_template(
@@ -140,6 +147,7 @@ Qwen3_Embedding_0B6 = ModelMeta(
140
147
  revision="b22da495047858cce924d27d76261e96be6febc0", # Commit of @tomaarsen
141
148
  release_date="2025-06-05",
142
149
  n_parameters=595776512,
150
+ n_embedding_parameters=None,
143
151
  memory_usage_mb=1136,
144
152
  embed_dim=1024,
145
153
  max_tokens=32768,
@@ -163,6 +171,7 @@ Qwen3_Embedding_4B = ModelMeta(
163
171
  revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff", # Commit of @tomaarsen
164
172
  release_date="2025-06-05",
165
173
  n_parameters=4021774336,
174
+ n_embedding_parameters=None,
166
175
  memory_usage_mb=7671,
167
176
  embed_dim=2560,
168
177
  max_tokens=32768,
@@ -186,6 +195,7 @@ Qwen3_Embedding_8B = ModelMeta(
186
195
  revision="4e423935c619ae4df87b646a3ce949610c66241c", # Commit of @tomaarsen
187
196
  release_date="2025-06-05",
188
197
  n_parameters=7567295488,
198
+ n_embedding_parameters=None,
189
199
  memory_usage_mb=14433,
190
200
  embed_dim=4096,
191
201
  max_tokens=32768,
@@ -64,6 +64,7 @@ QZhou_Embedding = ModelMeta(
64
64
  revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8",
65
65
  release_date="2025-08-24",
66
66
  n_parameters=7_070_619_136,
67
+ n_embedding_parameters=None,
67
68
  memory_usage_mb=14436,
68
69
  embed_dim=3584,
69
70
  license="apache-2.0",
@@ -98,6 +99,7 @@ QZhou_Embedding_Zh = ModelMeta(
98
99
  revision="0321ccb126413d1e49c5ce908e802b63d35f18e2",
99
100
  release_date="2025-09-28",
100
101
  n_parameters=7_575_747_328,
102
+ n_embedding_parameters=None,
101
103
  memory_usage_mb=29431,
102
104
  embed_dim=1792,
103
105
  license="apache-2.0",
@@ -5,18 +5,19 @@ from typing import TYPE_CHECKING, Any, Literal
5
5
 
6
6
  import numpy as np
7
7
  import torch
8
- from torch.utils.data import DataLoader
9
8
 
10
- from mteb.abstasks.task_metadata import TaskMetadata
11
9
  from mteb.models.model_meta import ModelMeta
12
10
  from mteb.similarity_functions import (
13
11
  select_pairwise_similarity,
14
12
  select_similarity,
15
13
  )
16
- from mteb.types._encoder_io import Array, BatchedInput, PromptType
17
14
 
18
15
  if TYPE_CHECKING:
19
16
  from PIL import Image
17
+ from torch.utils.data import DataLoader
18
+
19
+ from mteb.abstasks.task_metadata import TaskMetadata
20
+ from mteb.types._encoder_io import Array, BatchedInput, PromptType
20
21
 
21
22
 
22
23
  def _string_to_vector(text: str | None, size: int) -> np.ndarray:
@@ -12,6 +12,7 @@ potion_base_8m = ModelMeta(
12
12
  revision="387897cfb09992e6d45ea9cd7b28b9fcf119e23a",
13
13
  release_date="2025-10-08",
14
14
  n_parameters=22893312,
15
+ n_embedding_parameters=22893312,
15
16
  memory_usage_mb=87,
16
17
  max_tokens=np.inf,
17
18
  embed_dim=256,
@@ -36,12 +36,76 @@ REASONIR_TRAINING_DATA = {
36
36
  "DuRetrieval",
37
37
  "QuoraRetrieval",
38
38
  }
39
+ _prompts_dict = {
40
+ "BrightBiologyRetrieval": {
41
+ "query": "Given a Biology post, retrieve relevant passages that help answer the post"
42
+ },
43
+ "BrightEarthScienceRetrieval": {
44
+ "query": "Given a Earth Science post, retrieve relevant passages that help answer the post"
45
+ },
46
+ "BrightEconomicsRetrieval": {
47
+ "query": "Given a Economics post, retrieve relevant passages that help answer the post"
48
+ },
49
+ "BrightPsychologyRetrieval": {
50
+ "query": "Given a Psychology post, retrieve relevant passages that help answer the post"
51
+ },
52
+ "BrightRoboticsRetrieval": {
53
+ "query": "Given a Robotics post, retrieve relevant passages that help answer the post"
54
+ },
55
+ "BrightStackoverflowRetrieval": {
56
+ "query": "Given a Stackoverflow post, retrieve relevant passages that help answer the post"
57
+ },
58
+ "BrightSustainableLivingRetrieval": {
59
+ "query": "Given a Sustainable Living post, retrieve relevant passages that help answer the post"
60
+ },
61
+ "BrightPonyRetrieval": {
62
+ "query": "Given a Pony question, retrieve relevant passages that help answer the question"
63
+ },
64
+ "BrightLeetcodeRetrieval": {
65
+ "query": "Given a coding problem, retrieve relevant examples that help answer the problem",
66
+ },
67
+ "BrightAopsRetrieval": {
68
+ "query": "Given a Math problem, retrieve relevant examples that help answer the problem"
69
+ },
70
+ "BrightTheoremQATheoremsRetrieval": {
71
+ "query": "Given a Math problem, retrieve relevant theorems that help answer the problem",
72
+ },
73
+ "BrightTheoremQAQuestionsRetrieval": {
74
+ "query": "Given a Math problem, retrieve relevant examples that help answer the problem",
75
+ },
76
+ "BrightBiologyLongRetrieval": {
77
+ "query": "Given a Biology post, retrieve relevant documents that help answer the post"
78
+ },
79
+ "BrightEarthScienceLongRetrieval": {
80
+ "query": "Given a Earth Science post, retrieve relevant documents that help answer the post"
81
+ },
82
+ "BrightEconomicsLongRetrieval": {
83
+ "query": "Given a Economics post, retrieve relevant documents that help answer the post"
84
+ },
85
+ "BrightPsychologyLongRetrieval": {
86
+ "query": "Given a Psychology post, retrieve relevant documents that help answer the post"
87
+ },
88
+ "BrightRoboticsLongRetrieval": {
89
+ "query": "Given a Robotics post, retrieve relevant documents that help answer the post"
90
+ },
91
+ "BrightStackoverflowLongRetrieval": {
92
+ "query": "Given a Stackoverflow post, retrieve relevant documents that help answer the post"
93
+ },
94
+ "BrightSustainableLivingLongRetrieval": {
95
+ "query": "Given a Sustainable Living post, retrieve relevant documents that help answer the post"
96
+ },
97
+ "BrightPonyLongRetrieval": {
98
+ "query": "Given a Pony question, retrieve relevant documents that help answer the question"
99
+ },
100
+ }
101
+
39
102
 
40
103
  ReasonIR_8B = ModelMeta(
41
104
  loader=InstructSentenceTransformerModel,
42
105
  loader_kwargs=dict(
43
106
  instruction_template=instruction_template,
44
107
  trust_remote_code=True,
108
+ prompts_dict=_prompts_dict,
45
109
  ),
46
110
  name="ReasonIR/ReasonIR-8B",
47
111
  model_type=["dense"],
@@ -50,6 +114,7 @@ ReasonIR_8B = ModelMeta(
50
114
  revision="c3d0690370ff4a8c3d3882d8dfa85c43650034fa",
51
115
  release_date="2025-04-29",
52
116
  n_parameters=7_500_000_000,
117
+ n_embedding_parameters=None,
53
118
  memory_usage_mb=None,
54
119
  embed_dim=4096,
55
120
  license="cc-by-nc-4.0",
@@ -1,22 +1,29 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from collections.abc import Callable
3
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
4
5
 
5
6
  import numpy as np
6
7
  import torch
7
8
  import torch.nn.functional as F
8
- from torch.utils.data import DataLoader
9
9
  from tqdm.auto import tqdm
10
10
 
11
11
  from mteb._requires_package import requires_package
12
- from mteb.abstasks.task_metadata import TaskMetadata
13
12
  from mteb.models.abs_encoder import AbsEncoder
14
13
  from mteb.models.model_meta import (
15
14
  ModelMeta,
16
15
  ScoringFunction,
17
16
  )
18
- from mteb.models.models_protocols import EncoderProtocol
19
- from mteb.types import Array, BatchedInput, PromptType
17
+ from mteb.types import PromptType
18
+
19
+ if TYPE_CHECKING:
20
+ from collections.abc import Callable
21
+
22
+ from torch.utils.data import DataLoader
23
+
24
+ from mteb.abstasks.task_metadata import TaskMetadata
25
+ from mteb.models.models_protocols import EncoderProtocol
26
+ from mteb.types import Array, BatchedInput
20
27
 
21
28
  logger = logging.getLogger(__name__)
22
29
 
@@ -172,6 +179,7 @@ repllama_llama2_original = ModelMeta(
172
179
  "mMARCO-NL", # translation not trained on
173
180
  },
174
181
  n_parameters=7_000_000,
182
+ n_embedding_parameters=131_072_000,
175
183
  memory_usage_mb=27,
176
184
  max_tokens=4096,
177
185
  embed_dim=4096,
@@ -201,6 +209,7 @@ repllama_llama2_reproduced = ModelMeta(
201
209
  revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision
202
210
  release_date="2024-09-15",
203
211
  n_parameters=7_000_000,
212
+ n_embedding_parameters=None,
204
213
  memory_usage_mb=27,
205
214
  max_tokens=4096,
206
215
  embed_dim=4096,
@@ -1,16 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from torch.utils.data import DataLoader
6
7
 
7
8
  from mteb._requires_package import requires_package
8
- from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.model_meta import ModelMeta
10
- from mteb.types import Array, BatchedInput, PromptType
11
10
 
12
11
  from .bge_models import bge_m3_training_data
13
12
 
13
+ if TYPE_CHECKING:
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mteb.abstasks.task_metadata import TaskMetadata
17
+ from mteb.types import Array, BatchedInput, PromptType
18
+
19
+
14
20
  logger = logging.getLogger(__name__)
15
21
 
16
22
 
@@ -225,6 +231,7 @@ monobert_large = ModelMeta(
225
231
  revision="0a97706f3827389da43b83348d5d18c9d53876fa",
226
232
  release_date="2020-05-28",
227
233
  n_parameters=None,
234
+ n_embedding_parameters=31_254_528,
228
235
  memory_usage_mb=None,
229
236
  max_tokens=None,
230
237
  embed_dim=None,
@@ -250,6 +257,7 @@ jina_reranker_multilingual = ModelMeta(
250
257
  revision="126747772a932960028d9f4dc93bd5d9c4869be4",
251
258
  release_date="2024-09-26",
252
259
  n_parameters=None,
260
+ n_embedding_parameters=None,
253
261
  memory_usage_mb=531,
254
262
  max_tokens=None,
255
263
  embed_dim=None,
@@ -313,6 +321,7 @@ bge_reranker_v2_m3 = ModelMeta(
313
321
  revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e",
314
322
  release_date="2024-06-24",
315
323
  n_parameters=None,
324
+ n_embedding_parameters=256_002_048,
316
325
  memory_usage_mb=2166,
317
326
  max_tokens=None,
318
327
  embed_dim=None,