mteb 2.1.7__py3-none-any.whl → 2.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. mteb/_create_dataloaders.py +6 -3
  2. mteb/_evaluators/any_sts_evaluator.py +14 -12
  3. mteb/_evaluators/clustering_evaluator.py +1 -1
  4. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  5. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  6. mteb/_evaluators/sklearn_evaluator.py +15 -28
  7. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  8. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  9. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  10. mteb/abstasks/clustering.py +1 -1
  11. mteb/abstasks/multilabel_classification.py +2 -2
  12. mteb/abstasks/task_metadata.py +1 -0
  13. mteb/benchmarks/benchmark.py +9 -0
  14. mteb/benchmarks/benchmarks/__init__.py +2 -0
  15. mteb/benchmarks/benchmarks/benchmarks.py +40 -1
  16. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  17. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  18. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  19. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  20. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  21. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  22. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  23. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  24. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  25. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  26. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  27. mteb/models/model_implementations/align_models.py +6 -0
  28. mteb/models/model_implementations/ara_models.py +7 -0
  29. mteb/models/model_implementations/blip2_models.py +9 -0
  30. mteb/models/model_implementations/blip_models.py +19 -0
  31. mteb/models/model_implementations/cadet_models.py +8 -0
  32. mteb/models/model_implementations/cde_models.py +12 -0
  33. mteb/models/model_implementations/codefuse_models.py +15 -0
  34. mteb/models/model_implementations/codesage_models.py +12 -0
  35. mteb/models/model_implementations/misc_models.py +6 -0
  36. mteb/models/model_implementations/moco_models.py +9 -0
  37. mteb/models/model_implementations/openclip_models.py +16 -0
  38. mteb/models/model_implementations/piccolo_models.py +6 -0
  39. mteb/models/model_implementations/rasgaard_models.py +7 -1
  40. mteb/models/model_implementations/tarka_models.py +317 -0
  41. mteb/models/search_wrappers.py +5 -5
  42. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
  43. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  44. mteb/tasks/classification/ara/ajgt.py +1 -2
  45. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  46. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  47. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  48. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  49. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  50. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  51. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  52. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  53. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  54. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  55. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  56. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  57. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  58. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  59. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  60. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  61. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  62. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  63. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  64. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  65. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  66. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  67. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  68. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  69. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  70. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  71. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  72. mteb/tasks/classification/eng/news_classification.py +1 -2
  73. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  74. mteb/tasks/classification/eng/patent_classification.py +1 -2
  75. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  76. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  77. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  78. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  79. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  80. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  81. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  82. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  83. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  84. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  85. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  86. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  87. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  88. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  89. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  90. mteb/tasks/classification/est/estonian_valence.py +1 -2
  91. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  92. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  93. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  94. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  95. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  96. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  97. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
  98. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  99. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  100. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  101. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  102. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  103. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  104. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  105. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  106. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  107. mteb/tasks/classification/kor/klue_tc.py +1 -2
  108. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  109. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  110. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  111. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  112. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  113. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  114. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  115. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  116. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  117. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  118. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  119. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  120. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  121. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +1 -3
  122. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  123. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  124. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  125. mteb/tasks/classification/pol/polish_classification.py +3 -6
  126. mteb/tasks/classification/ron/moroco.py +1 -2
  127. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  128. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  129. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  130. mteb/tasks/classification/rus/headline_classification.py +1 -2
  131. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  132. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  133. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  134. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  135. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  136. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  137. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  138. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  139. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  140. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  141. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  142. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  143. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  144. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  145. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  146. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  147. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  148. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  149. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  150. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  151. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  152. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  153. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  154. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  155. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  156. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  157. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  158. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  159. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  160. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  161. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  162. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  163. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  164. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  165. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  166. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  167. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  168. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  169. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  170. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  171. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  172. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  173. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  174. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  175. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  176. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  177. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  178. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  179. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  180. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  181. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  182. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  183. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  184. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  185. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  186. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  187. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  188. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  189. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  190. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  191. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  192. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  193. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  194. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  195. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  196. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  197. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  198. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  199. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  200. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  201. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  202. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  203. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  204. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  205. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  206. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  207. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  208. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  209. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  210. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  211. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  212. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  213. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  214. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  215. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  216. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  217. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  218. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  219. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  220. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  221. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  222. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  223. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  224. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  225. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  226. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  227. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  228. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  229. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  230. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  231. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  232. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  233. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  234. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/METADATA +1 -1
  235. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/RECORD +239 -228
  236. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/WHEEL +0 -0
  237. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/entry_points.txt +0 -0
  238. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/licenses/LICENSE +0 -0
  239. {mteb-2.1.7.dist-info → mteb-2.1.8.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ from collections.abc import Callable
3
3
  from typing import Any, cast
4
4
 
5
5
  import torch
6
- from datasets import Dataset
6
+ from datasets import Dataset, Image
7
7
  from torch.utils.data import DataLoader, default_collate
8
8
 
9
9
  from mteb.abstasks.task_metadata import TaskMetadata
@@ -22,12 +22,14 @@ logger = logging.getLogger(__name__)
22
22
  def _create_dataloader_from_texts(
23
23
  text: list[str],
24
24
  batch_size: int = 32,
25
+ **kwargs: dict[str, Any],
25
26
  ) -> DataLoader[TextInput]:
26
27
  """Create a dataloader from a list of text.
27
28
 
28
29
  Args:
29
30
  text: A list of text to create a dataloader from.
30
31
  batch_size: Batch size for the dataloader.
32
+ kwargs: Not used, present catching extra arguments.
31
33
 
32
34
  Returns:
33
35
  A dataloader with the text.
@@ -244,14 +246,15 @@ def _prepare_image_dataset(
244
246
  transform: Callable[[Any], Any] | None = None,
245
247
  ) -> Dataset:
246
248
  """Prepare the image dataset by converting images to RGB and applying transformations."""
247
- # If the dataset uses a different column name for images, rename it to "image".
248
249
  if (
249
250
  image_column_name
250
251
  and image_column_name in dataset.column_names
251
252
  and "image" not in dataset.column_names
252
253
  ):
253
254
  dataset = dataset.rename_column(image_column_name, "image")
254
- # Map the conversion function over the dataset.
255
+ # don't process image if it's already in the correct format
256
+ if isinstance(dataset.features["image"], Image):
257
+ return dataset
255
258
  return dataset.map(
256
259
  _convert_images_to_rgb,
257
260
  fn_kwargs={"image_col_name": "image", "transform": transform},
@@ -45,16 +45,8 @@ class AnySTSEvaluator(Evaluator):
45
45
  **kwargs,
46
46
  ) -> None:
47
47
  super().__init__(**kwargs)
48
- self.first_column = create_dataloader(
49
- dataset,
50
- task_metadata,
51
- input_column=sentences_column_names[0],
52
- )
53
- self.second_column = create_dataloader(
54
- dataset,
55
- task_metadata,
56
- input_column=sentences_column_names[1],
57
- )
48
+ self.dataset = dataset
49
+ self.input_columns = sentences_column_names
58
50
  self.task_metadata = task_metadata
59
51
  self.hf_split = hf_split
60
52
  self.hf_subset = hf_subset
@@ -67,7 +59,12 @@ class AnySTSEvaluator(Evaluator):
67
59
  ) -> STSEvaluatorScores:
68
60
  logger.info("Running semantic similarity - Encoding samples (1/2)")
69
61
  embeddings1 = model.encode(
70
- self.first_column,
62
+ create_dataloader(
63
+ self.dataset,
64
+ self.task_metadata,
65
+ input_column=self.input_columns[0],
66
+ **encode_kwargs,
67
+ ),
71
68
  task_metadata=self.task_metadata,
72
69
  hf_split=self.hf_split,
73
70
  hf_subset=self.hf_subset,
@@ -76,7 +73,12 @@ class AnySTSEvaluator(Evaluator):
76
73
 
77
74
  logger.info("Running semantic similarity - Encoding samples (2/2)...")
78
75
  embeddings2 = model.encode(
79
- self.second_column,
76
+ create_dataloader(
77
+ self.dataset,
78
+ self.task_metadata,
79
+ input_column=self.input_columns[1],
80
+ **encode_kwargs,
81
+ ),
80
82
  task_metadata=self.task_metadata,
81
83
  hf_split=self.hf_split,
82
84
  hf_subset=self.hf_subset,
@@ -44,7 +44,7 @@ class ClusteringEvaluator(Evaluator):
44
44
  self.dataset,
45
45
  self.task_metadata,
46
46
  input_column=self.input_column_name,
47
- batch_size=encode_kwargs["batch_size"],
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running clustering - Encoding samples...")
@@ -103,7 +103,7 @@ class ImageTextPairClassificationEvaluator(Evaluator):
103
103
  text_embeddings = model.encode(
104
104
  DataLoader(
105
105
  Dataset.from_dict({"text": texts}),
106
- batch_size=encode_kwargs["batch_size"],
106
+ **encode_kwargs,
107
107
  ),
108
108
  task_metadata=self.task_metadata,
109
109
  hf_subset=self.hf_subset,
@@ -122,8 +122,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
122
122
  image_embeddings = model.encode(
123
123
  DataLoader(
124
124
  CustomImageDataset(images),
125
- batch_size=encode_kwargs["batch_size"],
126
125
  collate_fn=lambda x: {"image": [item["image"] for item in x]},
126
+ **encode_kwargs,
127
127
  ),
128
128
  task_metadata=self.task_metadata,
129
129
  hf_subset=self.hf_subset,
@@ -106,6 +106,7 @@ class PairClassificationEvaluator(Evaluator):
106
106
  self.dataset,
107
107
  task_metadata=self.task_metadata,
108
108
  input_column=self.input1_column_name,
109
+ **encode_kwargs,
109
110
  ),
110
111
  task_metadata=self.task_metadata,
111
112
  hf_split=self.hf_split,
@@ -117,6 +118,7 @@ class PairClassificationEvaluator(Evaluator):
117
118
  self.dataset,
118
119
  task_metadata=self.task_metadata,
119
120
  input_column=self.input2_column_name,
121
+ **encode_kwargs,
120
122
  ),
121
123
  task_metadata=self.task_metadata,
122
124
  hf_split=self.hf_split,
@@ -168,7 +170,7 @@ class PairClassificationEvaluator(Evaluator):
168
170
  )
169
171
  all_unique_texts_embs = np.asarray(
170
172
  model.encode(
171
- _create_dataloader_from_texts(all_unique_texts),
173
+ _create_dataloader_from_texts(all_unique_texts, **encode_kwargs),
172
174
  task_metadata=task_metadata,
173
175
  hf_split=hf_split,
174
176
  hf_subset=hf_subset,
@@ -6,7 +6,7 @@ from datasets import Dataset
6
6
  from torch.utils.data import DataLoader
7
7
  from typing_extensions import Self
8
8
 
9
- from mteb._create_dataloaders import _create_image_dataloader
9
+ from mteb._create_dataloaders import create_dataloader
10
10
  from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models import EncoderProtocol
12
12
  from mteb.types import BatchedInput
@@ -50,33 +50,20 @@ class SklearnEvaluator(Evaluator):
50
50
  self.evaluator_model = evaluator_model
51
51
 
52
52
  def create_dataloaders(
53
- self, batch_size: int
53
+ self, encode_kwargs: dict[str, Any]
54
54
  ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
55
- if self.task_metadata.modalities == ["image"]:
56
- dataloader_train = _create_image_dataloader(
57
- self.train_dataset,
58
- image_column_name=self.values_column_name,
59
- batch_size=batch_size,
60
- )
61
- dataloader_test = _create_image_dataloader(
62
- self.eval_dataset,
63
- image_column_name=self.values_column_name,
64
- batch_size=batch_size,
65
- )
66
- elif self.task_metadata.modalities == ["text"]:
67
- if self.values_column_name != "text":
68
- self.train_dataset = self.train_dataset.rename_column(
69
- self.values_column_name, "text"
70
- )
71
- self.eval_dataset = self.eval_dataset.rename_column(
72
- self.values_column_name, "text"
73
- )
74
- dataloader_train = DataLoader(self.train_dataset)
75
- dataloader_test = DataLoader(self.eval_dataset)
76
- else:
77
- raise ValueError(
78
- "ClassificationEvaluator only supports image and text modalities."
79
- )
55
+ dataloader_train = create_dataloader(
56
+ self.train_dataset,
57
+ self.task_metadata,
58
+ input_column=self.values_column_name,
59
+ **encode_kwargs,
60
+ )
61
+ dataloader_test = create_dataloader(
62
+ self.eval_dataset,
63
+ self.task_metadata,
64
+ input_column=self.values_column_name,
65
+ **encode_kwargs,
66
+ )
80
67
  return dataloader_train, dataloader_test
81
68
 
82
69
  def __call__( # type: ignore[override]
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
98
85
 
99
86
  """
100
87
  dataloader_train, dataloader_test = self.create_dataloaders(
101
- batch_size=encode_kwargs["batch_size"]
88
+ encode_kwargs=encode_kwargs,
102
89
  )
103
90
 
104
91
  logger.info("Running - Encoding samples...")
@@ -46,7 +46,10 @@ class BitextMiningEvaluator(Evaluator):
46
46
 
47
47
  embeddings = {}
48
48
  for sub in tqdm(subsets):
49
- dataloader = _create_dataloader_from_texts(self.sentences[sub])
49
+ dataloader = _create_dataloader_from_texts(
50
+ self.sentences[sub],
51
+ **encode_kwargs,
52
+ )
50
53
  embeddings[sub] = model.encode(
51
54
  dataloader,
52
55
  task_metadata=self.task_metadata,
@@ -109,7 +109,8 @@ class SummarizationEvaluator(Evaluator):
109
109
  summary
110
110
  for human_summaries in self.human_summaries
111
111
  for summary in human_summaries
112
- ]
112
+ ],
113
+ **encode_kwargs,
113
114
  ),
114
115
  task_metadata=self.task_metadata,
115
116
  hf_subset=self.hf_subset,
@@ -124,7 +125,8 @@ class SummarizationEvaluator(Evaluator):
124
125
  summary
125
126
  for machine_summaries in self.machine_summaries
126
127
  for summary in machine_summaries
127
- ]
128
+ ],
129
+ **encode_kwargs,
128
130
  ),
129
131
  task_metadata=self.task_metadata,
130
132
  hf_subset=self.hf_subset,
@@ -42,14 +42,14 @@ class ZeroShotClassificationEvaluator(Evaluator):
42
42
  ) -> Array:
43
43
  dataloader = create_dataloader(
44
44
  self.dataset,
45
- batch_size=encode_kwargs["batch_size"],
46
45
  input_column=self.input_column_name,
47
46
  task_metadata=self.task_metadata,
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running zero-shot classification - Encoding labels...")
51
51
  text_label_embeddings = model.encode(
52
- _create_dataloader_from_texts(self.candidate_labels),
52
+ _create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
53
53
  task_metadata=self.task_metadata,
54
54
  hf_subset=self.hf_subset,
55
55
  hf_split=self.hf_split,
@@ -200,7 +200,7 @@ class AbsTaskClustering(AbsTask):
200
200
  downsampled_dataset,
201
201
  self.metadata,
202
202
  input_column=self.input_column_name,
203
- batch_size=encode_kwargs["batch_size"],
203
+ **encode_kwargs,
204
204
  ),
205
205
  task_metadata=self.metadata,
206
206
  hf_subset=hf_subset,
@@ -112,7 +112,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
112
112
  unique_train_dataset,
113
113
  self.metadata,
114
114
  input_column=self.input_column_name,
115
- batch_size=encode_kwargs["batch_size"],
115
+ **encode_kwargs,
116
116
  )
117
117
 
118
118
  logger.info("Running multilabel classification - Encoding training set...")
@@ -141,7 +141,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
141
141
  test_dataset.select_columns(self.input_column_name),
142
142
  self.metadata,
143
143
  input_column=self.input_column_name,
144
- batch_size=encode_kwargs["batch_size"],
144
+ **encode_kwargs,
145
145
  )
146
146
 
147
147
  logger.info("Running multilabel classification - Encoding test set...")
@@ -107,6 +107,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
107
107
  SampleCreationMethod = Literal[
108
108
  "found",
109
109
  "created",
110
+ "created and machine-translated",
110
111
  "human-translated and localized",
111
112
  "human-translated",
112
113
  "machine-translated",
@@ -106,3 +106,12 @@ class MIEBBenchmark(Benchmark):
106
106
  self, benchmark_results: BenchmarkResults
107
107
  ) -> pd.DataFrame:
108
108
  return _create_summary_table_mean_task_type(benchmark_results)
109
+
110
+
111
+ class Vidore3Benchmark(Benchmark):
112
+ """Wrapper for Vidore3 benchmark."""
113
+
114
+ def _create_summary_table(
115
+ self, benchmark_results: BenchmarkResults
116
+ ) -> pd.DataFrame:
117
+ return _create_summary_table_mean_public_private(benchmark_results)
@@ -38,6 +38,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
38
38
  SEB,
39
39
  VIDORE,
40
40
  VIDORE_V2,
41
+ VIDORE_V3,
41
42
  VISUAL_DOCUMENT_RETRIEVAL,
42
43
  VN_MTEB,
43
44
  CoIR,
@@ -108,6 +109,7 @@ __all__ = [
108
109
  "SEB",
109
110
  "VIDORE",
110
111
  "VIDORE_V2",
112
+ "VIDORE_V3",
111
113
  "VISUAL_DOCUMENT_RETRIEVAL",
112
114
  "VN_MTEB",
113
115
  "CoIR",
@@ -1,4 +1,9 @@
1
- from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark, MIEBBenchmark
1
+ from mteb.benchmarks.benchmark import (
2
+ Benchmark,
3
+ HUMEBenchmark,
4
+ MIEBBenchmark,
5
+ Vidore3Benchmark,
6
+ )
2
7
  from mteb.get_tasks import MTEBTasks, get_task, get_tasks
3
8
 
4
9
  MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,
@@ -2214,6 +2219,40 @@ VIDORE_V2 = Benchmark(
2214
2219
  """,
2215
2220
  )
2216
2221
 
2222
+ VIDORE_V3 = Vidore3Benchmark(
2223
+ name="ViDoRe(v3)",
2224
+ display_name="ViDoRe V3",
2225
+ icon="https://cdn-uploads.huggingface.co/production/uploads/66e16a677c2eb2da5109fb5c/x99xqw__fl2UaPbiIdC_f.png",
2226
+ tasks=get_tasks(
2227
+ tasks=[
2228
+ "Vidore3FinanceEnRetrieval",
2229
+ "Vidore3IndustrialRetrieval",
2230
+ "Vidore3ComputerScienceRetrieval",
2231
+ "Vidore3PharmaceuticalsRetrieval",
2232
+ "Vidore3HrRetrieval",
2233
+ "Vidore3FinanceFrRetrieval",
2234
+ "Vidore3PhysicsRetrieval",
2235
+ "Vidore3EnergyRetrieval",
2236
+ "Vidore3TelecomRetrieval",
2237
+ "Vidore3NuclearRetrieval",
2238
+ ]
2239
+ ),
2240
+ description="ViDoRe V3 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents. The benchmark includes both open and closed datasets: to submit results on private tasks, please [open an issue](https://github.com/embeddings-benchmark/mteb/issues?template=eval_request.yaml).",
2241
+ reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
2242
+ citation=r"""
2243
+ @misc{mace2025vidorev3,
2244
+ author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
2245
+ day = {5},
2246
+ howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
2247
+ journal = {Hugging Face Blog},
2248
+ month = {November},
2249
+ publisher = {Hugging Face},
2250
+ title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
2251
+ year = {2025},
2252
+ }
2253
+ """,
2254
+ )
2255
+
2217
2256
  VISUAL_DOCUMENT_RETRIEVAL = Benchmark(
2218
2257
  name="VisualDocumentRetrieval",
2219
2258
  display_name="Visual Document Retrieval",
@@ -0,0 +1,214 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 9450,
4
+ "number_of_characters": 152825,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 1700,
8
+ "average_image_width": 1700.0,
9
+ "max_image_width": 1700,
10
+ "min_image_height": 2200,
11
+ "average_image_height": 2200.0,
12
+ "max_image_height": 2200,
13
+ "unique_images": 1359
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 152825,
17
+ "min_text_length": 21,
18
+ "average_text_length": 118.46899224806202,
19
+ "max_text_length": 591,
20
+ "unique_texts": 1290
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 37764,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 4.879069767441861,
27
+ "max_relevant_docs_per_query": 21,
28
+ "unique_relevant_docs": 3534
29
+ },
30
+ "top_ranked_statistics": null,
31
+ "hf_subset_descriptive_stats": {
32
+ "french": {
33
+ "num_samples": 1575,
34
+ "number_of_characters": 27948,
35
+ "documents_text_statistics": null,
36
+ "documents_image_statistics": {
37
+ "min_image_width": 1700,
38
+ "average_image_width": 1700.0,
39
+ "max_image_width": 1700,
40
+ "min_image_height": 2200,
41
+ "average_image_height": 2200.0,
42
+ "max_image_height": 2200,
43
+ "unique_images": 1359
44
+ },
45
+ "queries_text_statistics": {
46
+ "total_text_length": 27948,
47
+ "min_text_length": 28,
48
+ "average_text_length": 129.9906976744186,
49
+ "max_text_length": 563,
50
+ "unique_texts": 215
51
+ },
52
+ "queries_image_statistics": null,
53
+ "relevant_docs_statistics": {
54
+ "num_relevant_docs": 6294,
55
+ "min_relevant_docs_per_query": 1,
56
+ "average_relevant_docs_per_query": 4.879069767441861,
57
+ "max_relevant_docs_per_query": 21,
58
+ "unique_relevant_docs": 589
59
+ },
60
+ "top_ranked_statistics": null
61
+ },
62
+ "spanish": {
63
+ "num_samples": 1575,
64
+ "number_of_characters": 26025,
65
+ "documents_text_statistics": null,
66
+ "documents_image_statistics": {
67
+ "min_image_width": 1700,
68
+ "average_image_width": 1700.0,
69
+ "max_image_width": 1700,
70
+ "min_image_height": 2200,
71
+ "average_image_height": 2200.0,
72
+ "max_image_height": 2200,
73
+ "unique_images": 1359
74
+ },
75
+ "queries_text_statistics": {
76
+ "total_text_length": 26025,
77
+ "min_text_length": 30,
78
+ "average_text_length": 121.04651162790698,
79
+ "max_text_length": 565,
80
+ "unique_texts": 215
81
+ },
82
+ "queries_image_statistics": null,
83
+ "relevant_docs_statistics": {
84
+ "num_relevant_docs": 6294,
85
+ "min_relevant_docs_per_query": 1,
86
+ "average_relevant_docs_per_query": 4.879069767441861,
87
+ "max_relevant_docs_per_query": 21,
88
+ "unique_relevant_docs": 589
89
+ },
90
+ "top_ranked_statistics": null
91
+ },
92
+ "english": {
93
+ "num_samples": 1575,
94
+ "number_of_characters": 22198,
95
+ "documents_text_statistics": null,
96
+ "documents_image_statistics": {
97
+ "min_image_width": 1700,
98
+ "average_image_width": 1700.0,
99
+ "max_image_width": 1700,
100
+ "min_image_height": 2200,
101
+ "average_image_height": 2200.0,
102
+ "max_image_height": 2200,
103
+ "unique_images": 1359
104
+ },
105
+ "queries_text_statistics": {
106
+ "total_text_length": 22198,
107
+ "min_text_length": 22,
108
+ "average_text_length": 103.24651162790698,
109
+ "max_text_length": 486,
110
+ "unique_texts": 215
111
+ },
112
+ "queries_image_statistics": null,
113
+ "relevant_docs_statistics": {
114
+ "num_relevant_docs": 6294,
115
+ "min_relevant_docs_per_query": 1,
116
+ "average_relevant_docs_per_query": 4.879069767441861,
117
+ "max_relevant_docs_per_query": 21,
118
+ "unique_relevant_docs": 589
119
+ },
120
+ "top_ranked_statistics": null
121
+ },
122
+ "german": {
123
+ "num_samples": 1575,
124
+ "number_of_characters": 26237,
125
+ "documents_text_statistics": null,
126
+ "documents_image_statistics": {
127
+ "min_image_width": 1700,
128
+ "average_image_width": 1700.0,
129
+ "max_image_width": 1700,
130
+ "min_image_height": 2200,
131
+ "average_image_height": 2200.0,
132
+ "max_image_height": 2200,
133
+ "unique_images": 1359
134
+ },
135
+ "queries_text_statistics": {
136
+ "total_text_length": 26237,
137
+ "min_text_length": 22,
138
+ "average_text_length": 122.03255813953488,
139
+ "max_text_length": 542,
140
+ "unique_texts": 215
141
+ },
142
+ "queries_image_statistics": null,
143
+ "relevant_docs_statistics": {
144
+ "num_relevant_docs": 6294,
145
+ "min_relevant_docs_per_query": 1,
146
+ "average_relevant_docs_per_query": 4.879069767441861,
147
+ "max_relevant_docs_per_query": 21,
148
+ "unique_relevant_docs": 589
149
+ },
150
+ "top_ranked_statistics": null
151
+ },
152
+ "italian": {
153
+ "num_samples": 1575,
154
+ "number_of_characters": 25835,
155
+ "documents_text_statistics": null,
156
+ "documents_image_statistics": {
157
+ "min_image_width": 1700,
158
+ "average_image_width": 1700.0,
159
+ "max_image_width": 1700,
160
+ "min_image_height": 2200,
161
+ "average_image_height": 2200.0,
162
+ "max_image_height": 2200,
163
+ "unique_images": 1359
164
+ },
165
+ "queries_text_statistics": {
166
+ "total_text_length": 25835,
167
+ "min_text_length": 21,
168
+ "average_text_length": 120.16279069767442,
169
+ "max_text_length": 521,
170
+ "unique_texts": 215
171
+ },
172
+ "queries_image_statistics": null,
173
+ "relevant_docs_statistics": {
174
+ "num_relevant_docs": 6294,
175
+ "min_relevant_docs_per_query": 1,
176
+ "average_relevant_docs_per_query": 4.879069767441861,
177
+ "max_relevant_docs_per_query": 21,
178
+ "unique_relevant_docs": 589
179
+ },
180
+ "top_ranked_statistics": null
181
+ },
182
+ "portuguese": {
183
+ "num_samples": 1575,
184
+ "number_of_characters": 24582,
185
+ "documents_text_statistics": null,
186
+ "documents_image_statistics": {
187
+ "min_image_width": 1700,
188
+ "average_image_width": 1700.0,
189
+ "max_image_width": 1700,
190
+ "min_image_height": 2200,
191
+ "average_image_height": 2200.0,
192
+ "max_image_height": 2200,
193
+ "unique_images": 1359
194
+ },
195
+ "queries_text_statistics": {
196
+ "total_text_length": 24582,
197
+ "min_text_length": 26,
198
+ "average_text_length": 114.33488372093024,
199
+ "max_text_length": 591,
200
+ "unique_texts": 215
201
+ },
202
+ "queries_image_statistics": null,
203
+ "relevant_docs_statistics": {
204
+ "num_relevant_docs": 6294,
205
+ "min_relevant_docs_per_query": 1,
206
+ "average_relevant_docs_per_query": 4.879069767441861,
207
+ "max_relevant_docs_per_query": 21,
208
+ "unique_relevant_docs": 589
209
+ },
210
+ "top_ranked_statistics": null
211
+ }
212
+ }
213
+ }
214
+ }