mteb 2.1.7__py3-none-any.whl → 2.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +14 -12
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/benchmark.py +9 -0
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +40 -1
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/model_implementations/align_models.py +6 -0
- mteb/models/model_implementations/ara_models.py +7 -0
- mteb/models/model_implementations/blip2_models.py +9 -0
- mteb/models/model_implementations/blip_models.py +19 -0
- mteb/models/model_implementations/cadet_models.py +8 -0
- mteb/models/model_implementations/cde_models.py +12 -0
- mteb/models/model_implementations/codefuse_models.py +15 -0
- mteb/models/model_implementations/codesage_models.py +12 -0
- mteb/models/model_implementations/misc_models.py +6 -0
- mteb/models/model_implementations/moco_models.py +9 -0
- mteb/models/model_implementations/openclip_models.py +16 -0
- mteb/models/model_implementations/piccolo_models.py +6 -0
- mteb/models/model_implementations/rasgaard_models.py +7 -1
- mteb/models/model_implementations/tarka_models.py +317 -0
- mteb/models/search_wrappers.py +5 -5
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +1 -3
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/METADATA +1 -1
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/RECORD +239 -228
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/WHEEL +0 -0
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/top_level.txt +0 -0
|
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
BLIP2_CITATION = """@inproceedings{li2023blip2,
|
|
14
|
+
title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
|
|
15
|
+
author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
|
|
16
|
+
year={2023},
|
|
17
|
+
booktitle={ICML},
|
|
18
|
+
}"""
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
def blip2_loader(model_name, **kwargs):
|
|
15
22
|
requires_package(
|
|
@@ -176,6 +183,7 @@ blip2_opt_2_7b = ModelMeta(
|
|
|
176
183
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
177
184
|
use_instructions=False,
|
|
178
185
|
training_datasets=blip2_training_datasets,
|
|
186
|
+
citation=BLIP2_CITATION,
|
|
179
187
|
)
|
|
180
188
|
|
|
181
189
|
blip2_opt_6_7b_coco = ModelMeta(
|
|
@@ -198,4 +206,5 @@ blip2_opt_6_7b_coco = ModelMeta(
|
|
|
198
206
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
199
207
|
use_instructions=False,
|
|
200
208
|
training_datasets=blip2_training_datasets,
|
|
209
|
+
citation=BLIP2_CITATION,
|
|
201
210
|
)
|
|
@@ -10,6 +10,17 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
|
|
14
|
+
doi = {10.48550/ARXIV.2201.12086},
|
|
15
|
+
url = {https://arxiv.org/abs/2201.12086},
|
|
16
|
+
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
|
|
17
|
+
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
|
|
18
|
+
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
|
|
19
|
+
publisher = {arXiv},
|
|
20
|
+
year = {2022},
|
|
21
|
+
copyright = {Creative Commons Attribution 4.0 International}
|
|
22
|
+
}"""
|
|
23
|
+
|
|
13
24
|
|
|
14
25
|
class BLIPModel(AbsEncoder):
|
|
15
26
|
def __init__(
|
|
@@ -140,6 +151,7 @@ blip_image_captioning_large = ModelMeta(
|
|
|
140
151
|
# CC3M+CC12M+SBU
|
|
141
152
|
# LAION115M
|
|
142
153
|
),
|
|
154
|
+
citation=BLIP_CITATION,
|
|
143
155
|
)
|
|
144
156
|
|
|
145
157
|
blip_image_captioning_base = ModelMeta(
|
|
@@ -166,6 +178,7 @@ blip_image_captioning_base = ModelMeta(
|
|
|
166
178
|
# CC3M+CC12M+SBU
|
|
167
179
|
# LAION115M
|
|
168
180
|
),
|
|
181
|
+
citation=BLIP_CITATION,
|
|
169
182
|
)
|
|
170
183
|
|
|
171
184
|
|
|
@@ -192,6 +205,7 @@ blip_vqa_base = ModelMeta(
|
|
|
192
205
|
# CC3M+CC12M+SBU
|
|
193
206
|
# LAION115M
|
|
194
207
|
),
|
|
208
|
+
citation=BLIP_CITATION,
|
|
195
209
|
)
|
|
196
210
|
|
|
197
211
|
blip_vqa_capfilt_large = ModelMeta(
|
|
@@ -217,6 +231,7 @@ blip_vqa_capfilt_large = ModelMeta(
|
|
|
217
231
|
# CC3M+CC12M+SBU
|
|
218
232
|
# LAION115M
|
|
219
233
|
),
|
|
234
|
+
citation=BLIP_CITATION,
|
|
220
235
|
)
|
|
221
236
|
|
|
222
237
|
blip_itm_base_coco = ModelMeta(
|
|
@@ -242,6 +257,7 @@ blip_itm_base_coco = ModelMeta(
|
|
|
242
257
|
# CC3M+CC12M+SBU
|
|
243
258
|
# LAION115M
|
|
244
259
|
),
|
|
260
|
+
citation=BLIP_CITATION,
|
|
245
261
|
)
|
|
246
262
|
|
|
247
263
|
blip_itm_large_coco = ModelMeta(
|
|
@@ -268,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
|
|
|
268
284
|
# CC3M+CC12M+SBU
|
|
269
285
|
# LAION115M
|
|
270
286
|
),
|
|
287
|
+
citation=BLIP_CITATION,
|
|
271
288
|
)
|
|
272
289
|
|
|
273
290
|
blip_itm_base_flickr = ModelMeta(
|
|
@@ -294,6 +311,7 @@ blip_itm_base_flickr = ModelMeta(
|
|
|
294
311
|
# LAION115M
|
|
295
312
|
# Flickr30k
|
|
296
313
|
),
|
|
314
|
+
citation=BLIP_CITATION,
|
|
297
315
|
)
|
|
298
316
|
|
|
299
317
|
blip_itm_large_flickr = ModelMeta(
|
|
@@ -319,4 +337,5 @@ blip_itm_large_flickr = ModelMeta(
|
|
|
319
337
|
# CC3M+CC12M+SBU
|
|
320
338
|
# LAION115M
|
|
321
339
|
),
|
|
340
|
+
citation=BLIP_CITATION,
|
|
322
341
|
)
|
|
@@ -3,6 +3,13 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
|
|
|
3
3
|
|
|
4
4
|
from .bge_models import bge_m3_training_data
|
|
5
5
|
|
|
6
|
+
CADET_CITATION = """@article{tamber2025conventionalcontrastivelearningfalls,
|
|
7
|
+
title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
|
|
8
|
+
author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
|
|
9
|
+
journal={arXiv:2505.19274},
|
|
10
|
+
year={2025}
|
|
11
|
+
}"""
|
|
12
|
+
|
|
6
13
|
cadet_training_data = {
|
|
7
14
|
# we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
|
|
8
15
|
# However, we do use queries from MSMARCO as examples for synthetic query generation.
|
|
@@ -46,4 +53,5 @@ cadet_embed = ModelMeta(
|
|
|
46
53
|
public_training_data="https://github.com/manveertamber/cadet-dense-retrieval",
|
|
47
54
|
training_datasets=cadet_training_data,
|
|
48
55
|
adapted_from="intfloat/e5-base-unsupervised",
|
|
56
|
+
citation=CADET_CITATION,
|
|
49
57
|
)
|
|
@@ -24,6 +24,16 @@ if TYPE_CHECKING:
|
|
|
24
24
|
)
|
|
25
25
|
logger = logging.getLogger(__name__)
|
|
26
26
|
|
|
27
|
+
CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
|
|
28
|
+
title={Contextual Document Embeddings},
|
|
29
|
+
author={John X. Morris and Alexander M. Rush},
|
|
30
|
+
year={2024},
|
|
31
|
+
eprint={2410.02525},
|
|
32
|
+
archivePrefix={arXiv},
|
|
33
|
+
primaryClass={cs.CL},
|
|
34
|
+
url={https://arxiv.org/abs/2410.02525},
|
|
35
|
+
}"""
|
|
36
|
+
|
|
27
37
|
|
|
28
38
|
class CDEWrapper(SentenceTransformerEncoderWrapper):
|
|
29
39
|
dataset_embeddings: torch.Tensor | None = None
|
|
@@ -217,6 +227,7 @@ cde_small_v1 = ModelMeta(
|
|
|
217
227
|
training_datasets=bge_full_data,
|
|
218
228
|
public_training_code="https://github.com/jxmorris12/cde",
|
|
219
229
|
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
|
|
230
|
+
citation=CDE_CITATION,
|
|
220
231
|
)
|
|
221
232
|
|
|
222
233
|
cde_small_v2 = ModelMeta(
|
|
@@ -244,4 +255,5 @@ cde_small_v2 = ModelMeta(
|
|
|
244
255
|
training_datasets=bge_full_data,
|
|
245
256
|
public_training_code="https://github.com/jxmorris12/cde",
|
|
246
257
|
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
|
|
258
|
+
citation=CDE_CITATION,
|
|
247
259
|
)
|
|
@@ -2,6 +2,18 @@ from mteb.models import ModelMeta
|
|
|
2
2
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
3
3
|
from mteb.types import PromptType
|
|
4
4
|
|
|
5
|
+
F2LLM_CITATION = """@article{2025F2LLM,
|
|
6
|
+
title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
|
|
7
|
+
author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
|
|
8
|
+
journal={CoRR},
|
|
9
|
+
volume={abs/2510.02294},
|
|
10
|
+
year={2025},
|
|
11
|
+
url={https://doi.org/10.48550/arXiv.2510.02294},
|
|
12
|
+
doi={10.48550/ARXIV.2510.02294},
|
|
13
|
+
eprinttype={arXiv},
|
|
14
|
+
eprint={2510.02294}
|
|
15
|
+
}"""
|
|
16
|
+
|
|
5
17
|
training_datasets = {
|
|
6
18
|
"MSMARCO",
|
|
7
19
|
"ArguAna",
|
|
@@ -146,6 +158,7 @@ F2LLM_0B6 = ModelMeta(
|
|
|
146
158
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
147
159
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
148
160
|
training_datasets=training_datasets,
|
|
161
|
+
citation=F2LLM_CITATION,
|
|
149
162
|
)
|
|
150
163
|
|
|
151
164
|
F2LLM_1B7 = ModelMeta(
|
|
@@ -174,6 +187,7 @@ F2LLM_1B7 = ModelMeta(
|
|
|
174
187
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
175
188
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
176
189
|
training_datasets=training_datasets,
|
|
190
|
+
citation=F2LLM_CITATION,
|
|
177
191
|
)
|
|
178
192
|
|
|
179
193
|
F2LLM_4B = ModelMeta(
|
|
@@ -202,4 +216,5 @@ F2LLM_4B = ModelMeta(
|
|
|
202
216
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
203
217
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
204
218
|
training_datasets=training_datasets,
|
|
219
|
+
citation=F2LLM_CITATION,
|
|
205
220
|
)
|
|
@@ -1,6 +1,15 @@
|
|
|
1
1
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
2
2
|
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
3
3
|
|
|
4
|
+
CODESAGE_CITATION = """@inproceedings{
|
|
5
|
+
zhang2024code,
|
|
6
|
+
title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
|
|
7
|
+
author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
|
|
8
|
+
booktitle={The Twelfth International Conference on Learning Representations},
|
|
9
|
+
year={2024},
|
|
10
|
+
url={https://openreview.net/forum?id=vfzRRjumpX}
|
|
11
|
+
}"""
|
|
12
|
+
|
|
4
13
|
codesage_languages = [
|
|
5
14
|
"python-Code",
|
|
6
15
|
"javascript-Code",
|
|
@@ -33,6 +42,7 @@ codesage_large = ModelMeta(
|
|
|
33
42
|
"CodeSearchNetRetrieval",
|
|
34
43
|
"CodeSearchNetCCRetrieval",
|
|
35
44
|
},
|
|
45
|
+
citation=CODESAGE_CITATION,
|
|
36
46
|
)
|
|
37
47
|
|
|
38
48
|
codesage_base = ModelMeta(
|
|
@@ -58,6 +68,7 @@ codesage_base = ModelMeta(
|
|
|
58
68
|
"CodeSearchNetRetrieval",
|
|
59
69
|
"CodeSearchNetCCRetrieval",
|
|
60
70
|
},
|
|
71
|
+
citation=CODESAGE_CITATION,
|
|
61
72
|
)
|
|
62
73
|
|
|
63
74
|
codesage_small = ModelMeta(
|
|
@@ -83,4 +94,5 @@ codesage_small = ModelMeta(
|
|
|
83
94
|
"CodeSearchNetRetrieval",
|
|
84
95
|
"CodeSearchNetCCRetrieval",
|
|
85
96
|
},
|
|
97
|
+
citation=CODESAGE_CITATION,
|
|
86
98
|
)
|
|
@@ -31,6 +31,12 @@ Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
|
|
|
31
31
|
training_datasets=None,
|
|
32
32
|
adapted_from="mistralai/Mistral-7B-v0.1",
|
|
33
33
|
superseded_by=None,
|
|
34
|
+
citation="""@article{chen2024little,
|
|
35
|
+
title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
|
|
36
|
+
author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
|
|
37
|
+
journal={arXiv preprint arXiv:2410.18634},
|
|
38
|
+
year={2024}
|
|
39
|
+
}""",
|
|
34
40
|
)
|
|
35
41
|
Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
|
|
36
42
|
name="Gameselo/STS-multilingual-mpnet-base-v2",
|
|
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
MOCOV3_CITATION = """@Article{chen2021mocov3,
|
|
14
|
+
author = {Xinlei Chen* and Saining Xie* and Kaiming He},
|
|
15
|
+
title = {An Empirical Study of Training Self-Supervised Vision Transformers},
|
|
16
|
+
journal = {arXiv preprint arXiv:2104.02057},
|
|
17
|
+
year = {2021},
|
|
18
|
+
}"""
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
def mocov3_loader(model_name, **kwargs):
|
|
15
22
|
requires_package(mocov3_loader, "timm", model_name, "pip install 'mteb[timm]'")
|
|
@@ -129,6 +136,7 @@ mocov3_vit_base = ModelMeta(
|
|
|
129
136
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
130
137
|
use_instructions=False,
|
|
131
138
|
training_datasets=mocov3_training_datasets,
|
|
139
|
+
citation=MOCOV3_CITATION,
|
|
132
140
|
)
|
|
133
141
|
|
|
134
142
|
mocov3_vit_large = ModelMeta(
|
|
@@ -151,4 +159,5 @@ mocov3_vit_large = ModelMeta(
|
|
|
151
159
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
152
160
|
use_instructions=False,
|
|
153
161
|
training_datasets=mocov3_training_datasets,
|
|
162
|
+
citation=MOCOV3_CITATION,
|
|
154
163
|
)
|
|
@@ -10,6 +10,14 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
|
|
14
|
+
title={Reproducible scaling laws for contrastive language-image learning},
|
|
15
|
+
author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
|
|
16
|
+
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
|
|
17
|
+
pages={2818--2829},
|
|
18
|
+
year={2023}
|
|
19
|
+
}"""
|
|
20
|
+
|
|
13
21
|
|
|
14
22
|
def openclip_loader(model_name, **kwargs):
|
|
15
23
|
requires_package(
|
|
@@ -133,6 +141,7 @@ CLIP_ViT_L_14_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
133
141
|
training_datasets=set(
|
|
134
142
|
# DataComp-1B
|
|
135
143
|
),
|
|
144
|
+
citation=OPENCLIP_CITATION,
|
|
136
145
|
)
|
|
137
146
|
|
|
138
147
|
CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
|
|
@@ -157,6 +166,7 @@ CLIP_ViT_B_32_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
157
166
|
training_datasets=set(
|
|
158
167
|
# DataComp-1B
|
|
159
168
|
),
|
|
169
|
+
citation=OPENCLIP_CITATION,
|
|
160
170
|
)
|
|
161
171
|
|
|
162
172
|
CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
|
|
@@ -181,6 +191,7 @@ CLIP_ViT_B_16_DataComp_XL_s13B_b90K = ModelMeta(
|
|
|
181
191
|
training_datasets=set(
|
|
182
192
|
# DataComp-1B
|
|
183
193
|
),
|
|
194
|
+
citation=OPENCLIP_CITATION,
|
|
184
195
|
)
|
|
185
196
|
|
|
186
197
|
CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
|
|
@@ -205,6 +216,7 @@ CLIP_ViT_bigG_14_laion2B_39B_b160k = ModelMeta(
|
|
|
205
216
|
training_datasets=set(
|
|
206
217
|
# 2 Billion sample English subset of LAION-5B
|
|
207
218
|
),
|
|
219
|
+
citation=OPENCLIP_CITATION,
|
|
208
220
|
)
|
|
209
221
|
|
|
210
222
|
CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
|
|
@@ -229,6 +241,7 @@ CLIP_ViT_g_14_laion2B_s34B_b88K = ModelMeta(
|
|
|
229
241
|
training_datasets=set(
|
|
230
242
|
# 2 Billion sample English subset of LAION-5B
|
|
231
243
|
),
|
|
244
|
+
citation=OPENCLIP_CITATION,
|
|
232
245
|
)
|
|
233
246
|
|
|
234
247
|
CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
|
|
@@ -253,6 +266,7 @@ CLIP_ViT_H_14_laion2B_s32B_b79K = ModelMeta(
|
|
|
253
266
|
training_datasets=set(
|
|
254
267
|
# 2 Billion sample English subset of LAION-5B
|
|
255
268
|
),
|
|
269
|
+
citation=OPENCLIP_CITATION,
|
|
256
270
|
)
|
|
257
271
|
|
|
258
272
|
CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
|
|
@@ -277,6 +291,7 @@ CLIP_ViT_L_14_laion2B_s32B_b82K = ModelMeta(
|
|
|
277
291
|
training_datasets=set(
|
|
278
292
|
# 2 Billion sample English subset of LAION-5B
|
|
279
293
|
),
|
|
294
|
+
citation=OPENCLIP_CITATION,
|
|
280
295
|
)
|
|
281
296
|
|
|
282
297
|
CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
|
|
@@ -301,4 +316,5 @@ CLIP_ViT_B_32_laion2B_s34B_b79K = ModelMeta(
|
|
|
301
316
|
training_datasets=set(
|
|
302
317
|
# 2 Billion sample English subset of LAION-5B
|
|
303
318
|
),
|
|
319
|
+
citation=OPENCLIP_CITATION,
|
|
304
320
|
)
|
|
@@ -48,4 +48,10 @@ piccolo_large_zh_v2 = ModelMeta(
|
|
|
48
48
|
public_training_code=None,
|
|
49
49
|
public_training_data=None,
|
|
50
50
|
training_datasets=None, # They don't say
|
|
51
|
+
citation="""@misc{2405.06932,
|
|
52
|
+
Author = {Junqin Huang and Zhongjie Hu and Zihao Jing and Mengya Gao and Yichao Wu},
|
|
53
|
+
Title = {Piccolo2: General Text Embedding with Multi-task Hybrid Loss Training},
|
|
54
|
+
Year = {2024},
|
|
55
|
+
Eprint = {arXiv:2405.06932},
|
|
56
|
+
}""",
|
|
51
57
|
)
|
|
@@ -17,11 +17,17 @@ potion_base_8m = ModelMeta(
|
|
|
17
17
|
license="mit",
|
|
18
18
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
19
19
|
framework=["NumPy", "Sentence Transformers"],
|
|
20
|
-
reference="https://huggingface.co/
|
|
20
|
+
reference="https://huggingface.co/rasgaard/m2v-dfm-large",
|
|
21
21
|
use_instructions=False,
|
|
22
22
|
adapted_from="KennethEnevoldsen/dfm-sentence-encoder-large",
|
|
23
23
|
superseded_by=None,
|
|
24
24
|
training_datasets=set(), # distilled
|
|
25
25
|
public_training_code="https://github.com/MinishLab/model2vec",
|
|
26
26
|
public_training_data="https://huggingface.co/datasets/HuggingFaceFW/fineweb-2", # distilled on this
|
|
27
|
+
citation="""@article{minishlab2024model2vec,
|
|
28
|
+
author = {Tulkens, Stephan and {van Dongen}, Thomas},
|
|
29
|
+
title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
|
|
30
|
+
year = {2024},
|
|
31
|
+
url = {https://github.com/MinishLab/model2vec}
|
|
32
|
+
}""",
|
|
27
33
|
)
|