mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,20 @@
1
1
  from mteb.models import ModelMeta
2
2
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
3
+ from mteb.models.model_meta import ScoringFunction
3
4
  from mteb.types import PromptType
4
5
 
6
+ F2LLM_CITATION = """@article{2025F2LLM,
7
+ title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
8
+ author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
9
+ journal={CoRR},
10
+ volume={abs/2510.02294},
11
+ year={2025},
12
+ url={https://doi.org/10.48550/arXiv.2510.02294},
13
+ doi={10.48550/ARXIV.2510.02294},
14
+ eprinttype={arXiv},
15
+ eprint={2510.02294}
16
+ }"""
17
+
5
18
  training_datasets = {
6
19
  "MSMARCO",
7
20
  "ArguAna",
@@ -62,6 +75,22 @@ training_datasets = {
62
75
  "TwentyNewsgroupsClustering",
63
76
  }
64
77
 
78
+ c2llm_training_datasets = {
79
+ "CodeSearchNet",
80
+ "CodeSearchNetRetrieval",
81
+ "CodeSearchNetCCRetrieval",
82
+ "CodeEditSearchRetrieval",
83
+ "CodeFeedbackMT",
84
+ "CodeFeedbackST",
85
+ "CodeTransOceanContest",
86
+ "CodeTransOceanDL",
87
+ "COIRCodeSearchNetRetrieval",
88
+ "CosQA",
89
+ "StackOverflowQA",
90
+ "SyntheticText2SQL",
91
+ "AdvTrain",
92
+ }
93
+
65
94
  prompts_dict = {
66
95
  "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not counterfactual.",
67
96
  "Banking77Classification": "Given an online banking query, find the corresponding intents.",
@@ -107,6 +136,77 @@ prompts_dict = {
107
136
  }
108
137
 
109
138
 
139
+ c2llm_prompts_dict = {
140
+ "CodeEditSearchRetrieval": {
141
+ "query": "Retrieve the diff code that relevant the following query:\n",
142
+ "document": "Retrieved Answer:",
143
+ },
144
+ "CodeSearchNetRetrieval": {
145
+ "query": "Retrieve the code that solves the following query:\n",
146
+ "document": "Retrieved Answer:",
147
+ },
148
+ "AppsRetrieval": {
149
+ "query": "Given a problem description from a programming contest, retrieve code examples that can assist in solving it.\n",
150
+ "document": "Retrieved Answer:",
151
+ },
152
+ "CodeFeedbackMT": {
153
+ "query": "Given a multi-turn conversation history that includes both text and code, retrieve relevant multi-modal answers composed of text and code that address the ongoing discussion.\n",
154
+ "document": "Retrieved Answer:",
155
+ },
156
+ "CodeFeedbackST": {
157
+ "query": "Given a single-turn question composed of text and code, retrieve suitable answers that also mix text and code to provide helpful feedback.\n",
158
+ "document": "Retrieved Answer:",
159
+ },
160
+ "CodeSearchNetCCRetrieval": {
161
+ "query": "Given an initial code segment, retrieve the subsequent segment that continues the code.\n",
162
+ "document": "Retrieved Answer:",
163
+ },
164
+ "CodeTransOceanContest": {
165
+ "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
166
+ "document": "Retrieved Answer:",
167
+ },
168
+ "CodeTransOceanDL": {
169
+ "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
170
+ "document": "Retrieved Answer:",
171
+ },
172
+ "COIRCodeSearchNetRetrieval": {
173
+ "query": "Given a code snippet, retrieve its corresponding document string that summarizes its functionality.\n",
174
+ "document": "Retrieved Answer:",
175
+ },
176
+ "CosQA": {
177
+ "query": "Given a query from a web search, retrieve code that is helpful in addressing the query.\n",
178
+ "document": "Retrieved Answer:",
179
+ },
180
+ "StackOverflowQA": {
181
+ "query": "Given a question combining text and code, retrieve relevant answers that also contain both text and code snippets and can address the question.\n",
182
+ "document": "Retrieved Answer:",
183
+ },
184
+ "SyntheticText2SQL": {
185
+ "query": "Given a natural language question, retrieve SQL queries that serve as appropriate responses.\n",
186
+ "document": "Retrieved Answer:",
187
+ },
188
+ }
189
+
190
+ c2llm_languages = [
191
+ "eng-Latn",
192
+ "zho-Hans",
193
+ "python-Code",
194
+ "javascript-Code",
195
+ "go-Code",
196
+ "ruby-Code",
197
+ "java-Code",
198
+ "php-Code",
199
+ ]
200
+
201
+ c2llm_loader_kwargs = dict(
202
+ trust_remote_code=True,
203
+ prompts_dict=c2llm_prompts_dict,
204
+ apply_instruction_to_passages=True,
205
+ max_seq_length=2048,
206
+ padding_side="left",
207
+ )
208
+
209
+
110
210
  def instruction_template(
111
211
  instruction: str, prompt_type: PromptType | None = None
112
212
  ) -> str:
@@ -130,6 +230,7 @@ F2LLM_0B6 = ModelMeta(
130
230
  max_seq_length=8192,
131
231
  ),
132
232
  name="codefuse-ai/F2LLM-0.6B",
233
+ model_type=["dense"],
133
234
  languages=["eng-Latn"],
134
235
  open_weights=True,
135
236
  revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7",
@@ -141,11 +242,12 @@ F2LLM_0B6 = ModelMeta(
141
242
  max_tokens=8192,
142
243
  reference="https://huggingface.co/codefuse-ai/F2LLM-0.6B",
143
244
  similarity_fn_name="cosine",
144
- framework=["Sentence Transformers", "PyTorch"],
245
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
145
246
  use_instructions=True,
146
247
  public_training_code="https://github.com/codefuse-ai/F2LLM",
147
248
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
148
249
  training_datasets=training_datasets,
250
+ citation=F2LLM_CITATION,
149
251
  )
150
252
 
151
253
  F2LLM_1B7 = ModelMeta(
@@ -158,6 +260,7 @@ F2LLM_1B7 = ModelMeta(
158
260
  max_seq_length=8192,
159
261
  ),
160
262
  name="codefuse-ai/F2LLM-1.7B",
263
+ model_type=["dense"],
161
264
  languages=["eng-Latn"],
162
265
  open_weights=True,
163
266
  revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c",
@@ -169,11 +272,12 @@ F2LLM_1B7 = ModelMeta(
169
272
  max_tokens=8192,
170
273
  reference="https://huggingface.co/codefuse-ai/F2LLM-1.7B",
171
274
  similarity_fn_name="cosine",
172
- framework=["Sentence Transformers", "PyTorch"],
275
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
173
276
  use_instructions=True,
174
277
  public_training_code="https://github.com/codefuse-ai/F2LLM",
175
278
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
176
279
  training_datasets=training_datasets,
280
+ citation=F2LLM_CITATION,
177
281
  )
178
282
 
179
283
  F2LLM_4B = ModelMeta(
@@ -186,6 +290,7 @@ F2LLM_4B = ModelMeta(
186
290
  max_seq_length=8192,
187
291
  ),
188
292
  name="codefuse-ai/F2LLM-4B",
293
+ model_type=["dense"],
189
294
  languages=["eng-Latn"],
190
295
  open_weights=True,
191
296
  revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25",
@@ -197,9 +302,66 @@ F2LLM_4B = ModelMeta(
197
302
  max_tokens=8192,
198
303
  reference="https://huggingface.co/codefuse-ai/F2LLM-4B",
199
304
  similarity_fn_name="cosine",
200
- framework=["Sentence Transformers", "PyTorch"],
305
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
201
306
  use_instructions=True,
202
307
  public_training_code="https://github.com/codefuse-ai/F2LLM",
203
308
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
204
309
  training_datasets=training_datasets,
310
+ citation=F2LLM_CITATION,
311
+ )
312
+
313
+ C2LLM_0B5 = ModelMeta(
314
+ loader=InstructSentenceTransformerModel,
315
+ loader_kwargs=c2llm_loader_kwargs,
316
+ name="codefuse-ai/C2LLM-0.5B",
317
+ revision="f08c18be03de42c6e388948a1804d4b271a953a2",
318
+ release_date="2025-12-22",
319
+ languages=c2llm_languages,
320
+ n_parameters=497252096,
321
+ memory_usage_mb=948.0,
322
+ max_tokens=32768,
323
+ embed_dim=896,
324
+ license="apache-2.0",
325
+ open_weights=True,
326
+ public_training_code=None,
327
+ public_training_data=None,
328
+ framework=["PyTorch", "Sentence Transformers", "Transformers", "safetensors"],
329
+ reference="https://huggingface.co/codefuse-ai/C2LLM-0.5B",
330
+ similarity_fn_name=ScoringFunction.COSINE,
331
+ use_instructions=True,
332
+ training_datasets=c2llm_training_datasets,
333
+ adapted_from=None,
334
+ superseded_by=None,
335
+ modalities=["text"],
336
+ is_cross_encoder=None,
337
+ citation=None,
338
+ contacts=None,
339
+ )
340
+
341
+ C2LLM_7B = ModelMeta(
342
+ loader=InstructSentenceTransformerModel,
343
+ loader_kwargs=c2llm_loader_kwargs,
344
+ name="codefuse-ai/C2LLM-7B",
345
+ revision="c1dc16d6d64eb962c783bfb36a6d9c2f24a86dca",
346
+ release_date="2025-12-22",
347
+ languages=c2llm_languages,
348
+ n_parameters=7667028992,
349
+ memory_usage_mb=14624.0,
350
+ max_tokens=32768,
351
+ embed_dim=3584,
352
+ license="apache-2.0",
353
+ open_weights=True,
354
+ public_training_code=None,
355
+ public_training_data=None,
356
+ framework=["PyTorch", "Sentence Transformers", "Transformers", "safetensors"],
357
+ reference="https://huggingface.co/codefuse-ai/C2LLM-7B",
358
+ similarity_fn_name=ScoringFunction.COSINE,
359
+ use_instructions=True,
360
+ training_datasets=c2llm_training_datasets,
361
+ adapted_from=None,
362
+ superseded_by=None,
363
+ modalities=["text"],
364
+ is_cross_encoder=None,
365
+ citation=None,
366
+ contacts=None,
205
367
  )
@@ -1,6 +1,15 @@
1
1
  from mteb.models.model_meta import ModelMeta, ScoringFunction
2
2
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
3
3
 
4
+ CODESAGE_CITATION = """@inproceedings{
5
+ zhang2024code,
6
+ title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
7
+ author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
8
+ booktitle={The Twelfth International Conference on Learning Representations},
9
+ year={2024},
10
+ url={https://openreview.net/forum?id=vfzRRjumpX}
11
+ }"""
12
+
4
13
  codesage_languages = [
5
14
  "python-Code",
6
15
  "javascript-Code",
@@ -13,6 +22,7 @@ codesage_languages = [
13
22
  codesage_large = ModelMeta(
14
23
  loader=sentence_transformers_loader,
15
24
  name="codesage/codesage-large-v2",
25
+ model_type=["dense"],
16
26
  languages=codesage_languages,
17
27
  revision="6e5d6dc15db3e310c37c6dbac072409f95ffa5c5",
18
28
  release_date="2024-02-03",
@@ -25,7 +35,7 @@ codesage_large = ModelMeta(
25
35
  open_weights=True,
26
36
  public_training_code=None,
27
37
  public_training_data=None,
28
- framework=["PyTorch"],
38
+ framework=["PyTorch", "Transformers"],
29
39
  reference="https://huggingface.co/codesage/codesage-large-v2",
30
40
  similarity_fn_name=ScoringFunction.COSINE,
31
41
  use_instructions=False,
@@ -33,11 +43,13 @@ codesage_large = ModelMeta(
33
43
  "CodeSearchNetRetrieval",
34
44
  "CodeSearchNetCCRetrieval",
35
45
  },
46
+ citation=CODESAGE_CITATION,
36
47
  )
37
48
 
38
49
  codesage_base = ModelMeta(
39
50
  loader=sentence_transformers_loader,
40
51
  name="codesage/codesage-base-v2",
52
+ model_type=["dense"],
41
53
  languages=codesage_languages,
42
54
  revision="92eac4f44c8674638f039f1b0d8280f2539cb4c7",
43
55
  release_date="2024-02-03",
@@ -50,7 +62,7 @@ codesage_base = ModelMeta(
50
62
  open_weights=True,
51
63
  public_training_code=None,
52
64
  public_training_data=None,
53
- framework=["PyTorch"],
65
+ framework=["PyTorch", "Transformers"],
54
66
  reference="https://huggingface.co/codesage/codesage-base-v2",
55
67
  similarity_fn_name=ScoringFunction.COSINE,
56
68
  use_instructions=False,
@@ -58,11 +70,13 @@ codesage_base = ModelMeta(
58
70
  "CodeSearchNetRetrieval",
59
71
  "CodeSearchNetCCRetrieval",
60
72
  },
73
+ citation=CODESAGE_CITATION,
61
74
  )
62
75
 
63
76
  codesage_small = ModelMeta(
64
77
  loader=sentence_transformers_loader,
65
78
  name="codesage/codesage-small-v2",
79
+ model_type=["dense"],
66
80
  languages=codesage_languages,
67
81
  revision="4844c2f24b25e181aa43ca058cc73dd2622565c1",
68
82
  release_date="2024-02-03",
@@ -75,7 +89,7 @@ codesage_small = ModelMeta(
75
89
  open_weights=True,
76
90
  public_training_code=None,
77
91
  public_training_data=None,
78
- framework=["PyTorch"],
92
+ framework=["PyTorch", "Transformers"],
79
93
  reference="https://huggingface.co/codesage/codesage-small-v2",
80
94
  similarity_fn_name=ScoringFunction.COSINE,
81
95
  use_instructions=False,
@@ -83,4 +97,5 @@ codesage_small = ModelMeta(
83
97
  "CodeSearchNetRetrieval",
84
98
  "CodeSearchNetCCRetrieval",
85
99
  },
100
+ citation=CODESAGE_CITATION,
86
101
  )
@@ -8,6 +8,7 @@ import torch
8
8
  from torch.utils.data import DataLoader
9
9
  from tqdm.auto import tqdm
10
10
 
11
+ from mteb._requires_package import requires_package
11
12
  from mteb.abstasks.task_metadata import TaskMetadata
12
13
  from mteb.models.abs_encoder import AbsEncoder
13
14
  from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -219,9 +220,11 @@ class CohereTextEmbeddingModel(AbsEncoder):
219
220
  output_dimension: int | None = None,
220
221
  **kwargs,
221
222
  ) -> None:
222
- import cohere # type: ignore
223
+ requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")
223
224
 
224
- self.model_name = model_name.lstrip("Cohere/Cohere-")
225
+ import cohere
226
+
227
+ self.model_name = model_name.removeprefix("Cohere/Cohere-")
225
228
  self.sep = sep
226
229
  self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
227
230
  if embedding_type not in get_args(EmbeddingType):
@@ -377,6 +380,7 @@ cohere_mult_3 = ModelMeta(
377
380
  model_prompts=model_prompts,
378
381
  ),
379
382
  name="Cohere/Cohere-embed-multilingual-v3.0",
383
+ model_type=["dense"],
380
384
  languages=supported_languages,
381
385
  open_weights=False,
382
386
  revision="1",
@@ -388,7 +392,7 @@ cohere_mult_3 = ModelMeta(
388
392
  reference="https://cohere.com/blog/introducing-embed-v3",
389
393
  license=None,
390
394
  similarity_fn_name=ScoringFunction.COSINE,
391
- framework=["API"],
395
+ framework=["API", "Transformers"],
392
396
  use_instructions=True,
393
397
  public_training_code=None,
394
398
  public_training_data=None, # assumed
@@ -401,6 +405,7 @@ cohere_eng_3 = ModelMeta(
401
405
  model_prompts=model_prompts,
402
406
  ),
403
407
  name="Cohere/Cohere-embed-english-v3.0",
408
+ model_type=["dense"],
404
409
  languages=["eng-Latn"],
405
410
  open_weights=False,
406
411
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -412,7 +417,7 @@ cohere_eng_3 = ModelMeta(
412
417
  embed_dim=1024,
413
418
  license=None,
414
419
  similarity_fn_name=ScoringFunction.COSINE,
415
- framework=["API"],
420
+ framework=["API", "Transformers"],
416
421
  use_instructions=True,
417
422
  public_training_code=None,
418
423
  public_training_data=None, # assumed
@@ -425,6 +430,7 @@ cohere_mult_light_3 = ModelMeta(
425
430
  model_prompts=model_prompts,
426
431
  ),
427
432
  name="Cohere/Cohere-embed-multilingual-light-v3.0",
433
+ model_type=["dense"],
428
434
  languages=supported_languages,
429
435
  open_weights=False,
430
436
  revision="1",
@@ -436,7 +442,7 @@ cohere_mult_light_3 = ModelMeta(
436
442
  embed_dim=384,
437
443
  license=None,
438
444
  similarity_fn_name=ScoringFunction.COSINE,
439
- framework=["API"],
445
+ framework=["API", "Transformers"],
440
446
  use_instructions=True,
441
447
  public_training_code=None,
442
448
  public_training_data=None, # assumed
@@ -449,6 +455,7 @@ cohere_eng_light_3 = ModelMeta(
449
455
  model_prompts=model_prompts,
450
456
  ),
451
457
  name="Cohere/Cohere-embed-english-light-v3.0",
458
+ model_type=["dense"],
452
459
  languages=["eng-Latn"],
453
460
  open_weights=False,
454
461
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -460,7 +467,7 @@ cohere_eng_light_3 = ModelMeta(
460
467
  embed_dim=384,
461
468
  license=None,
462
469
  similarity_fn_name=ScoringFunction.COSINE,
463
- framework=["API"],
470
+ framework=["API", "Transformers"],
464
471
  use_instructions=True,
465
472
  public_training_code=None,
466
473
  public_training_data=None, # assumed
@@ -378,9 +378,10 @@ def cohere_v_loader(model_name, **kwargs):
378
378
 
379
379
 
380
380
  cohere_mult_3 = ModelMeta(
381
- loader=cohere_v_loader, # type: ignore
381
+ loader=cohere_v_loader,
382
382
  loader_kwargs={"model_name": "embed-multilingual-v3.0"},
383
383
  name="cohere/embed-multilingual-v3.0",
384
+ model_type=["dense"],
384
385
  languages=[], # Unknown, but support >100 languages
385
386
  revision="1",
386
387
  release_date="2024-10-24",
@@ -401,9 +402,10 @@ cohere_mult_3 = ModelMeta(
401
402
  )
402
403
 
403
404
  cohere_eng_3 = ModelMeta(
404
- loader=cohere_v_loader, # type: ignore
405
+ loader=cohere_v_loader,
405
406
  loader_kwargs={"model_name": "embed-english-v3.0"},
406
407
  name="cohere/embed-english-v3.0",
408
+ model_type=["dense"],
407
409
  languages=["eng-Latn"],
408
410
  revision="1",
409
411
  release_date="2024-10-24",
@@ -426,6 +428,7 @@ cohere_eng_3 = ModelMeta(
426
428
  cohere_embed_v4_multimodal = ModelMeta(
427
429
  loader=cohere_v_loader,
428
430
  loader_kwargs=dict(model_name="embed-v4.0"),
431
+ model_type=["dense"],
429
432
  name="Cohere/Cohere-embed-v4.0",
430
433
  languages=all_languages,
431
434
  revision="1",
@@ -450,6 +453,7 @@ cohere_embed_v4_multimodal_binary = ModelMeta(
450
453
  loader=cohere_v_loader,
451
454
  loader_kwargs=dict(embedding_type="binary"),
452
455
  name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)",
456
+ model_type=["dense"],
453
457
  languages=all_languages,
454
458
  revision="1",
455
459
  release_date="2024-12-01",
@@ -474,6 +478,7 @@ cohere_embed_v4_multimodal_int8 = ModelMeta(
474
478
  loader=cohere_v_loader,
475
479
  loader_kwargs=dict(embedding_type="int8"),
476
480
  name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)",
481
+ model_type=["dense"],
477
482
  languages=all_languages,
478
483
  revision="1",
479
484
  release_date="2024-12-01",
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from PIL import Image
6
7
  from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
@@ -15,6 +16,9 @@ from mteb.models.abs_encoder import AbsEncoder
15
16
  from mteb.models.model_meta import ModelMeta, ScoringFunction
16
17
  from mteb.types import Array, BatchedInput, PromptType
17
18
 
19
+ if TYPE_CHECKING:
20
+ from PIL import Image
21
+
18
22
  logger = logging.getLogger(__name__)
19
23
 
20
24
 
@@ -89,6 +93,7 @@ class ColPaliEngineWrapper(AbsEncoder):
89
93
  **kwargs,
90
94
  ):
91
95
  import torchvision.transforms.functional as F
96
+ from PIL import Image
92
97
 
93
98
  all_embeds = []
94
99
 
@@ -196,10 +201,10 @@ COLPALI_CITATION = """
196
201
 
197
202
  COLPALI_TRAINING_DATA = {
198
203
  # from https://huggingface.co/datasets/vidore/colpali_train_set
199
- "DocVQA",
200
- "InfoVQA",
201
- "TATDQA",
202
- "arXivQA",
204
+ "VidoreDocVQARetrieval",
205
+ "VidoreInfoVQARetrieval",
206
+ "VidoreTatdqaRetrieval",
207
+ "VidoreArxivQARetrieval",
203
208
  }
204
209
 
205
210
  colpali_v1_1 = ModelMeta(
@@ -208,6 +213,7 @@ colpali_v1_1 = ModelMeta(
208
213
  torch_dtype=torch.float16,
209
214
  ),
210
215
  name="vidore/colpali-v1.1",
216
+ model_type=["late-interaction"],
211
217
  languages=["eng-Latn"],
212
218
  revision="a0f15e3bcf97110e7ac1bb4be4bcd30eeb31992a",
213
219
  release_date="2024-08-21",
@@ -220,7 +226,7 @@ colpali_v1_1 = ModelMeta(
220
226
  open_weights=True,
221
227
  public_training_code="https://github.com/illuin-tech/colpali",
222
228
  public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
223
- framework=["ColPali"],
229
+ framework=["ColPali", "safetensors"],
224
230
  reference="https://huggingface.co/vidore/colpali-v1.1",
225
231
  similarity_fn_name=ScoringFunction.MAX_SIM,
226
232
  use_instructions=True,
@@ -234,6 +240,7 @@ colpali_v1_2 = ModelMeta(
234
240
  torch_dtype=torch.float16,
235
241
  ),
236
242
  name="vidore/colpali-v1.2",
243
+ model_type=["late-interaction"],
237
244
  languages=["eng-Latn"],
238
245
  revision="6b89bc63c16809af4d111bfe412e2ac6bc3c9451",
239
246
  release_date="2024-08-26",
@@ -246,7 +253,7 @@ colpali_v1_2 = ModelMeta(
246
253
  open_weights=True,
247
254
  public_training_code="https://github.com/illuin-tech/colpali",
248
255
  public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
249
- framework=["ColPali"],
256
+ framework=["ColPali", "safetensors"],
250
257
  reference="https://huggingface.co/vidore/colpali-v1.2",
251
258
  similarity_fn_name=ScoringFunction.MAX_SIM,
252
259
  use_instructions=True,
@@ -260,6 +267,7 @@ colpali_v1_3 = ModelMeta(
260
267
  torch_dtype=torch.float16,
261
268
  ),
262
269
  name="vidore/colpali-v1.3",
270
+ model_type=["late-interaction"],
263
271
  languages=["eng-Latn"],
264
272
  revision="1b5c8929330df1a66de441a9b5409a878f0de5b0",
265
273
  release_date="2024-11-01",
@@ -272,7 +280,7 @@ colpali_v1_3 = ModelMeta(
272
280
  open_weights=True,
273
281
  public_training_code="https://github.com/illuin-tech/colpali",
274
282
  public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
275
- framework=["ColPali"],
283
+ framework=["ColPali", "safetensors"],
276
284
  reference="https://huggingface.co/vidore/colpali-v1.3",
277
285
  similarity_fn_name=ScoringFunction.MAX_SIM,
278
286
  use_instructions=True,