mteb 2.1.7__py3-none-any.whl → 2.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. mteb/_create_dataloaders.py +6 -3
  2. mteb/_evaluators/any_sts_evaluator.py +14 -12
  3. mteb/_evaluators/clustering_evaluator.py +1 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/sklearn_evaluator.py +15 -28
  7. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  8. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  9. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  10. mteb/abstasks/clustering.py +1 -1
  11. mteb/abstasks/multilabel_classification.py +2 -2
  12. mteb/abstasks/task_metadata.py +1 -0
  13. mteb/benchmarks/benchmark.py +9 -0
  14. mteb/benchmarks/benchmarks/__init__.py +2 -0
  15. mteb/benchmarks/benchmarks/benchmarks.py +40 -1
  16. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  17. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  18. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  19. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  20. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  21. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  22. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  23. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  24. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  25. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  26. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  27. mteb/models/model_implementations/align_models.py +6 -0
  28. mteb/models/model_implementations/ara_models.py +7 -0
  29. mteb/models/model_implementations/blip2_models.py +9 -0
  30. mteb/models/model_implementations/blip_models.py +19 -0
  31. mteb/models/model_implementations/cadet_models.py +8 -0
  32. mteb/models/model_implementations/cde_models.py +12 -0
  33. mteb/models/model_implementations/codefuse_models.py +15 -0
  34. mteb/models/model_implementations/codesage_models.py +12 -0
  35. mteb/models/model_implementations/misc_models.py +6 -0
  36. mteb/models/model_implementations/moco_models.py +9 -0
  37. mteb/models/model_implementations/openclip_models.py +16 -0
  38. mteb/models/model_implementations/piccolo_models.py +6 -0
  39. mteb/models/model_implementations/rasgaard_models.py +7 -1
  40. mteb/models/model_implementations/tarka_models.py +317 -0
  41. mteb/models/search_wrappers.py +5 -5
  42. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
  43. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  44. mteb/tasks/classification/ara/ajgt.py +1 -2
  45. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  46. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  47. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  48. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  49. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  50. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  51. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  52. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  53. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  54. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  55. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  56. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  57. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  58. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  59. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  60. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  61. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  62. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  63. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  64. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  65. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  66. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  67. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  68. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  69. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  70. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  71. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  72. mteb/tasks/classification/eng/news_classification.py +1 -2
  73. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  74. mteb/tasks/classification/eng/patent_classification.py +1 -2
  75. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  76. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  77. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  78. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  79. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  80. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  81. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  82. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  83. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  84. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  85. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  86. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  87. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  88. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  89. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  90. mteb/tasks/classification/est/estonian_valence.py +1 -2
  91. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  92. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  93. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  94. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  95. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  96. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  97. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
  98. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  99. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  100. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  101. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  102. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  103. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  104. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  105. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  106. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  107. mteb/tasks/classification/kor/klue_tc.py +1 -2
  108. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  109. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  110. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  111. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  112. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  113. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  114. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  115. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  116. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  117. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  118. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  119. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  120. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  121. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +1 -3
  122. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  123. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  124. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  125. mteb/tasks/classification/pol/polish_classification.py +3 -6
  126. mteb/tasks/classification/ron/moroco.py +1 -2
  127. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  128. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  129. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  130. mteb/tasks/classification/rus/headline_classification.py +1 -2
  131. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  132. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  133. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  134. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  135. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  136. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  137. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  138. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  139. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  140. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  141. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  142. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  143. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  144. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  145. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  146. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  147. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  148. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  149. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  150. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  151. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  152. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  153. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  154. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  155. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  156. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  157. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  158. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  159. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  160. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  161. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  162. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  163. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  164. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  165. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  166. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  167. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  168. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  169. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  170. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  171. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  172. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  173. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  174. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  175. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  176. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  177. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  178. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  179. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  180. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  181. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  182. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  183. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  184. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  185. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  186. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  187. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  188. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  189. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  190. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  191. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  192. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  193. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  194. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  195. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  196. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  197. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  198. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  199. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  200. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  201. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  202. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  203. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  204. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  205. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  206. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  207. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  208. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  209. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  210. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  211. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  212. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  213. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  214. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  215. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  216. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  217. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  218. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  219. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  220. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  221. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  222. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  223. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  224. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  225. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  226. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  227. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  228. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  229. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  230. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  231. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  232. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  233. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  234. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/METADATA +1 -1
  235. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/RECORD +239 -228
  236. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/WHEEL +0 -0
  237. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/entry_points.txt +0 -0
  238. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/licenses/LICENSE +0 -0
  239. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ BLIP2_CITATION = """@inproceedings{li2023blip2,
14
+ title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
15
+ author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
16
+ year={2023},
17
+ booktitle={ICML},
18
+ }"""
19
+
13
20
 
14
21
  def blip2_loader(model_name, **kwargs):
15
22
  requires_package(
@@ -176,6 +183,7 @@ blip2_opt_2_7b = ModelMeta(
176
183
  similarity_fn_name=ScoringFunction.COSINE,
177
184
  use_instructions=False,
178
185
  training_datasets=blip2_training_datasets,
186
+ citation=BLIP2_CITATION,
179
187
  )
180
188
 
181
189
  blip2_opt_6_7b_coco = ModelMeta(
@@ -198,4 +206,5 @@ blip2_opt_6_7b_coco = ModelMeta(
198
206
  similarity_fn_name=ScoringFunction.COSINE,
199
207
  use_instructions=False,
200
208
  training_datasets=blip2_training_datasets,
209
+ citation=BLIP2_CITATION,
201
210
  )
@@ -10,6 +10,17 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
14
+ doi = {10.48550/ARXIV.2201.12086},
15
+ url = {https://arxiv.org/abs/2201.12086},
16
+ author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
17
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
18
+ title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
19
+ publisher = {arXiv},
20
+ year = {2022},
21
+ copyright = {Creative Commons Attribution 4.0 International}
22
+ }"""
23
+
13
24
 
14
25
  class BLIPModel(AbsEncoder):
15
26
  def __init__(
@@ -140,6 +151,7 @@ blip_image_captioning_large = ModelMeta(
140
151
  # CC3M+CC12M+SBU
141
152
  # LAION115M
142
153
  ),
154
+ citation=BLIP_CITATION,
143
155
  )
144
156
 
145
157
  blip_image_captioning_base = ModelMeta(
@@ -166,6 +178,7 @@ blip_image_captioning_base = ModelMeta(
166
178
  # CC3M+CC12M+SBU
167
179
  # LAION115M
168
180
  ),
181
+ citation=BLIP_CITATION,
169
182
  )
170
183
 
171
184
 
@@ -192,6 +205,7 @@ blip_vqa_base = ModelMeta(
192
205
  # CC3M+CC12M+SBU
193
206
  # LAION115M
194
207
  ),
208
+ citation=BLIP_CITATION,
195
209
  )
196
210
 
197
211
  blip_vqa_capfilt_large = ModelMeta(
@@ -217,6 +231,7 @@ blip_vqa_capfilt_large = ModelMeta(
217
231
  # CC3M+CC12M+SBU
218
232
  # LAION115M
219
233
  ),
234
+ citation=BLIP_CITATION,
220
235
  )
221
236
 
222
237
  blip_itm_base_coco = ModelMeta(
@@ -242,6 +257,7 @@ blip_itm_base_coco = ModelMeta(
242
257
  # CC3M+CC12M+SBU
243
258
  # LAION115M
244
259
  ),
260
+ citation=BLIP_CITATION,
245
261
  )
246
262
 
247
263
  blip_itm_large_coco = ModelMeta(
@@ -268,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
268
284
  # CC3M+CC12M+SBU
269
285
  # LAION115M
270
286
  ),
287
+ citation=BLIP_CITATION,
271
288
  )
272
289
 
273
290
  blip_itm_base_flickr = ModelMeta(
@@ -294,6 +311,7 @@ blip_itm_base_flickr = ModelMeta(
294
311
  # LAION115M
295
312
  # Flickr30k
296
313
  ),
314
+ citation=BLIP_CITATION,
297
315
  )
298
316
 
299
317
  blip_itm_large_flickr = ModelMeta(
@@ -319,4 +337,5 @@ blip_itm_large_flickr = ModelMeta(
319
337
  # CC3M+CC12M+SBU
320
338
  # LAION115M
321
339
  ),
340
+ citation=BLIP_CITATION,
322
341
  )
@@ -3,6 +3,13 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
3
3
 
4
4
  from .bge_models import bge_m3_training_data
5
5
 
6
+ CADET_CITATION = """@article{tamber2025conventionalcontrastivelearningfalls,
7
+ title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
8
+ author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
9
+ journal={arXiv:2505.19274},
10
+ year={2025}
11
+ }"""
12
+
6
13
  cadet_training_data = {
7
14
  # we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
8
15
  # However, we do use queries from MSMARCO as examples for synthetic query generation.
@@ -46,4 +53,5 @@ cadet_embed = ModelMeta(
46
53
  public_training_data="https://github.com/manveertamber/cadet-dense-retrieval",
47
54
  training_datasets=cadet_training_data,
48
55
  adapted_from="intfloat/e5-base-unsupervised",
56
+ citation=CADET_CITATION,
49
57
  )
@@ -24,6 +24,16 @@ if TYPE_CHECKING:
24
24
  )
25
25
  logger = logging.getLogger(__name__)
26
26
 
27
+ CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
28
+ title={Contextual Document Embeddings},
29
+ author={John X. Morris and Alexander M. Rush},
30
+ year={2024},
31
+ eprint={2410.02525},
32
+ archivePrefix={arXiv},
33
+ primaryClass={cs.CL},
34
+ url={https://arxiv.org/abs/2410.02525},
35
+ }"""
36
+
27
37
 
28
38
  class CDEWrapper(SentenceTransformerEncoderWrapper):
29
39
  dataset_embeddings: torch.Tensor | None = None
@@ -217,6 +227,7 @@ cde_small_v1 = ModelMeta(
217
227
  training_datasets=bge_full_data,
218
228
  public_training_code="https://github.com/jxmorris12/cde",
219
229
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
230
+ citation=CDE_CITATION,
220
231
  )
221
232
 
222
233
  cde_small_v2 = ModelMeta(
@@ -244,4 +255,5 @@ cde_small_v2 = ModelMeta(
244
255
  training_datasets=bge_full_data,
245
256
  public_training_code="https://github.com/jxmorris12/cde",
246
257
  public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
258
+ citation=CDE_CITATION,
247
259
  )
@@ -2,6 +2,18 @@ from mteb.models import ModelMeta
2
2
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
3
3
  from mteb.types import PromptType
4
4
 
5
+ F2LLM_CITATION = """@article{2025F2LLM,
6
+ title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
7
+ author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
8
+ journal={CoRR},
9
+ volume={abs/2510.02294},
10
+ year={2025},
11
+ url={https://doi.org/10.48550/arXiv.2510.02294},
12
+ doi={10.48550/ARXIV.2510.02294},
13
+ eprinttype={arXiv},
14
+ eprint={2510.02294}
15
+ }"""
16
+
5
17
  training_datasets = {
6
18
  "MSMARCO",
7
19
  "ArguAna",
@@ -146,6 +158,7 @@ F2LLM_0B6 = ModelMeta(
146
158
  public_training_code="https://github.com/codefuse-ai/F2LLM",
147
159
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
148
160
  training_datasets=training_datasets,
161
+ citation=F2LLM_CITATION,
149
162
  )
150
163
 
151
164
  F2LLM_1B7 = ModelMeta(
@@ -174,6 +187,7 @@ F2LLM_1B7 = ModelMeta(
174
187
  public_training_code="https://github.com/codefuse-ai/F2LLM",
175
188
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
176
189
  training_datasets=training_datasets,
190
+ citation=F2LLM_CITATION,
177
191
  )
178
192
 
179
193
  F2LLM_4B = ModelMeta(
@@ -202,4 +216,5 @@ F2LLM_4B = ModelMeta(
202
216
  public_training_code="https://github.com/codefuse-ai/F2LLM",
203
217
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
204
218
  training_datasets=training_datasets,
219
+ citation=F2LLM_CITATION,
205
220
  )
@@ -1,6 +1,15 @@
1
1
  from mteb.models.model_meta import ModelMeta, ScoringFunction
2
2
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
3
3
 
4
+ CODESAGE_CITATION = """@inproceedings{
5
+ zhang2024code,
6
+ title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
7
+ author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
8
+ booktitle={The Twelfth International Conference on Learning Representations},
9
+ year={2024},
10
+ url={https://openreview.net/forum?id=vfzRRjumpX}
11
+ }"""
12
+
4
13
  codesage_languages = [
5
14
  "python-Code",
6
15
  "javascript-Code",
@@ -33,6 +42,7 @@ codesage_large = ModelMeta(
33
42
  "CodeSearchNetRetrieval",
34
43
  "CodeSearchNetCCRetrieval",
35
44
  },
45
+ citation=CODESAGE_CITATION,
36
46
  )
37
47
 
38
48
  codesage_base = ModelMeta(
@@ -58,6 +68,7 @@ codesage_base = ModelMeta(
58
68
  "CodeSearchNetRetrieval",
59
69
  "CodeSearchNetCCRetrieval",
60
70
  },
71
+ citation=CODESAGE_CITATION,
61
72
  )
62
73
 
63
74
  codesage_small = ModelMeta(
@@ -83,4 +94,5 @@ codesage_small = ModelMeta(
83
94
  "CodeSearchNetRetrieval",
84
95
  "CodeSearchNetCCRetrieval",
85
96
  },
97
+ citation=CODESAGE_CITATION,
86
98
  )
@@ -31,6 +31,12 @@ Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
31
31
  training_datasets=None,
32
32
  adapted_from="mistralai/Mistral-7B-v0.1",
33
33
  superseded_by=None,
34
+ citation="""@article{chen2024little,
35
+ title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
36
+ author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
37
+ journal={arXiv preprint arXiv:2410.18634},
38
+ year={2024}
39
+ }""",
34
40
  )
35
41
  Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
36
42
  name="Gameselo/STS-multilingual-mpnet-base-v2",
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ MOCOV3_CITATION = """@Article{chen2021mocov3,
14
+ author = {Xinlei Chen* and Saining Xie* and Kaiming He},
15
+ title = {An Empirical Study of Training Self-Supervised Vision Transformers},
16
+ journal = {arXiv preprint arXiv:2104.02057},
17
+ year = {2021},
18
+ }"""
19
+
13
20
 
14
21
  def mocov3_loader(model_name, **kwargs):
15
22
  requires_package(mocov3_loader, "timm", model_name, "pip install 'mteb[timm]'")
@@ -129,6 +136,7 @@ mocov3_vit_base = ModelMeta(
129
136
  similarity_fn_name=ScoringFunction.COSINE,
130
137
  use_instructions=False,
131
138
  training_datasets=mocov3_training_datasets,
139
+ citation=MOCOV3_CITATION,
132
140
  )
133
141
 
134
142
  mocov3_vit_large = ModelMeta(
@@ -151,4 +159,5 @@ mocov3_vit_large = ModelMeta(
151
159
  similarity_fn_name=ScoringFunction.COSINE,
152
160
  use_instructions=False,
153
161
  training_datasets=mocov3_training_datasets,
162
+ citation=MOCOV3_CITATION,
154
163
  )
@@ -10,6 +10,14 @@ from mteb.models.abs_encoder import AbsEncoder
10
10
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
11
  from mteb.types import Array, BatchedInput, PromptType
12
12
 
13
+ OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
14
+ title={Reproducible scaling laws for contrastive language-image learning},
15
+ author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
16
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
17
+ pages={2818--2829},
18
+ year={2023}
19
+ }"""
20
+
13
21
 
14
22
  def openclip_loader(model_name, **kwargs):
15
23
  requires_package(
@@ -133,6 +141,7 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
133
141
  training_datasets=set(
134
142
  # DataComp-1B
135
143
  ),
144
+ citation=OPENCLIP_CITATION,
136
145
  )
137
146
 
138
147
  CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
@@ -157,6 +166,7 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
157
166
  training_datasets=set(
158
167
  # DataComp-1B
159
168
  ),
169
+ citation=OPENCLIP_CITATION,
160
170
  )
161
171
 
162
172
  CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
@@ -181,6 +191,7 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
181
191
  training_datasets=set(
182
192
  # DataComp-1B
183
193
  ),
194
+ citation=OPENCLIP_CITATION,
184
195
  )
185
196
 
186
197
  CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
@@ -205,6 +216,7 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
205
216
  training_datasets=set(
206
217
  # 2 Billion sample English subset of LAION-5B
207
218
  ),
219
+ citation=OPENCLIP_CITATION,
208
220
  )
209
221
 
210
222
  CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
@@ -229,6 +241,7 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
229
241
  training_datasets=set(
230
242
  # 2 Billion sample English subset of LAION-5B
231
243
  ),
244
+ citation=OPENCLIP_CITATION,
232
245
  )
233
246
 
234
247
  CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
@@ -253,6 +266,7 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
253
266
  training_datasets=set(
254
267
  # 2 Billion sample English subset of LAION-5B
255
268
  ),
269
+ citation=OPENCLIP_CITATION,
256
270
  )
257
271
 
258
272
  CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
@@ -277,6 +291,7 @@ CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
277
291
  training_datasets=set(
278
292
  # 2 Billion sample English subset of LAION-5B
279
293
  ),
294
+ citation=OPENCLIP_CITATION,
280
295
  )
281
296
 
282
297
  CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
@@ -301,4 +316,5 @@ CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
301
316
  training_datasets=set(
302
317
  # 2 Billion sample English subset of LAION-5B
303
318
  ),
319
+ citation=OPENCLIP_CITATION,
304
320
  )
@@ -48,4 +48,10 @@ piccolo_large_zh_v2 = ModelMeta(
48
48
  public_training_code=None,
49
49
  public_training_data=None,
50
50
  training_datasets=None, # They don't say
51
+ citation="""@misc{2405.06932,
52
+ Author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
53
+ Title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
54
+ Year = {2024},
55
+ Eprint = {arXiv:2405.06932},
56
+ }""",
51
57
  )
@@ -17,11 +17,17 @@ potion_base_8m = ModelMeta(
17
17
  license="mit",
18
18
  similarity_fn_name=ScoringFunction.COSINE,
19
19
  framework=["NumPy", "Sentence Transformers"],
20
- reference="https://huggingface.co/minishlab/rasgaard/m2v-dfm-large",
20
+ reference="https://huggingface.co/rasgaard/m2v-dfm-large",
21
21
  use_instructions=False,
22
22
  adapted_from="KennethEnevoldsen/dfm-sentence-encoder-large",
23
23
  superseded_by=None,
24
24
  training_datasets=set(), # distilled
25
25
  public_training_code="https://github.com/MinishLab/model2vec",
26
26
  public_training_data="https://huggingface.co/datasets/HuggingFaceFW/fineweb-2", # distilled on this
27
+ citation="""@article{minishlab2024model2vec,
28
+ author = {Tulkens, Stephan and {van Dongen}, Thomas},
29
+ title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
30
+ year = {2024},
31
+ url = {https://github.com/MinishLab/model2vec}
32
+ }""",
27
33
  )