mteb 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. mteb/benchmarks/benchmarks/__init__.py +2 -0
  2. mteb/benchmarks/benchmarks/benchmarks.py +62 -1
  3. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  4. mteb/evaluate.py +38 -7
  5. mteb/leaderboard/app.py +161 -124
  6. mteb/leaderboard/benchmark_selector.py +5 -2
  7. mteb/leaderboard/table.py +2 -4
  8. mteb/models/model_implementations/colpali_models.py +4 -4
  9. mteb/models/model_implementations/colqwen_models.py +206 -2
  10. mteb/models/model_implementations/euler_models.py +25 -0
  11. mteb/models/model_implementations/jina_models.py +203 -5
  12. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +8 -9
  13. mteb/models/model_implementations/ru_sentence_models.py +9 -0
  14. mteb/models/model_implementations/vdr_models.py +1 -0
  15. mteb/models/model_implementations/yuan_models_en.py +57 -0
  16. mteb/results/model_result.py +2 -1
  17. mteb/results/task_result.py +12 -0
  18. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  19. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  20. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  21. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  22. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/METADATA +5 -2
  23. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/RECORD +27 -23
  24. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/WHEEL +0 -0
  25. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/entry_points.txt +0 -0
  26. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/licenses/LICENSE +0 -0
  27. {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/top_level.txt +0 -0
mteb/evaluate.py CHANGED
@@ -7,6 +7,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, cast
 
+from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm
 
 from mteb._helpful_enum import HelpfulStrEnum
@@ -25,6 +26,7 @@ from mteb.models.sentence_transformer_wrapper import (
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import ModelResult, TaskResult
+from mteb.results.task_result import TaskError
 from mteb.types import HFSubset, PromptType, SplitName
 from mteb.types._metadata import ModelName, Revision
 
@@ -117,7 +119,8 @@ def _evaluate_task(
     co2_tracker: bool | None,
     encode_kwargs: dict[str, Any],
     prediction_folder: Path | None,
-) -> TaskResult:
+    public_only: bool | None,
+) -> TaskResult | TaskError:
     """The core logic to run a model on a given task. See `evaluate` for more details.
 
     Returns:
@@ -149,6 +152,7 @@ def _evaluate_task(
             encode_kwargs=encode_kwargs,
             co2_tracker=False,
             prediction_folder=prediction_folder,
+            public_only=public_only,
         )
         result.kg_co2_emissions = tracker.final_emissions
         return result
@@ -159,7 +163,20 @@ def _evaluate_task(
 
     data_loaded = task.data_loaded
     if not data_loaded:
-        task.load_data()
+        try:
+            task.load_data()
+        except DatasetNotFoundError as e:
+            if not task.metadata.is_public and public_only is None:
+                logger.warning(
+                    f"Dataset for private task '{task.metadata.name}' not found. "
+                    "Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
+                )
+                return TaskError(
+                    task_name=task.metadata.name,
+                    exception=str(e),
+                )
+            if public_only is False:
+                raise e
 
     evaluation_time = 0
 
@@ -281,6 +298,7 @@ def evaluate(
     overwrite_strategy: str | OverwriteStrategy = "only-missing",
     prediction_folder: Path | str | None = None,
     show_progress_bar: bool = True,
+    public_only: bool | None = None,
 ) -> ModelResult:
     """This function runs a model on a given task and returns the results.
 
@@ -304,6 +322,7 @@ def evaluate(
         prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
         show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
             `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
+        public_only: Run only public tasks. If None, it will attempt to run the private task.
 
     Returns:
         The results of the evaluation.
@@ -355,6 +374,7 @@ def evaluate(
             overwrite_strategy=overwrite_strategy,
             prediction_folder=prediction_folder,
             show_progress_bar=show_progress_bar,
+            public_only=public_only,
         )
         result = task.combine_task_results(results.task_results)
         return ModelResult(
@@ -367,6 +387,7 @@ def evaluate(
         task = tasks
     else:
         results = []
+        exceptions = []
         tasks_tqdm = tqdm(
             tasks,
             desc="Evaluating tasks",
@@ -384,12 +405,16 @@ def evaluate(
                 overwrite_strategy=overwrite_strategy,
                 prediction_folder=prediction_folder,
                 show_progress_bar=False,
+                public_only=public_only,
             )
             results.extend(_res.task_results)
+            if _res.exceptions:
+                exceptions.extend(_res.exceptions)
         return ModelResult(
             model_name=_res.model_name,
             model_revision=_res.model_revision,
             task_results=results,
+            exceptions=exceptions,
         )
 
     overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
@@ -459,16 +484,13 @@ def evaluate(
                     co2_tracker=co2_tracker,
                     encode_kwargs=encode_kwargs,
                     prediction_folder=prediction_folder,
+                    public_only=public_only,
                 )
             except Exception as e:
                 logger.error(
                     f"Error while running task {task.metadata.name} on splits {list(missing_eval.keys())}: {e}"
                 )
-                return ModelResult(
-                    model_name=model_name,
-                    model_revision=model_revision,
-                    task_results=[],
-                )
+                result = TaskError(task_name=task.metadata.name, exception=str(e))
         else:
             result = _evaluate_task(
                 model=model,
@@ -477,9 +499,18 @@
                 co2_tracker=False,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                public_only=public_only,
             )
         logger.info(f"✓ Finished evaluation for {task.metadata.name}")
 
+        if isinstance(result, TaskError):
+            return ModelResult(
+                model_name=model_name,
+                model_revision=model_revision,
+                task_results=[],
+                exceptions=[result],
+            )
+
         if existing_results:
             result = result.merge(existing_results)
 
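Note: the new `public_only` flag threads from `evaluate` down into `_evaluate_task`, where a missing private dataset becomes a `TaskError` recorded on `ModelResult.exceptions` instead of aborting the run. A rough usage sketch follows; the model and task names are placeholders, and the `mteb.evaluate(model, tasks, ...)` call shape is assumed from this diff rather than quoted from it:

    import mteb

    # Placeholder model and task selection, for illustration only.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
    tasks = mteb.get_tasks(tasks=["NFCorpus"])

    # public_only=None (default): private tasks are attempted; if their dataset
    # cannot be found, a warning is logged and a TaskError is collected in
    # results.exceptions instead of the run failing.
    # public_only=False: the DatasetNotFoundError is re-raised.
    results = mteb.evaluate(model, tasks, public_only=None)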
mteb/leaderboard/app.py CHANGED
@@ -5,7 +5,7 @@ import tempfile
 import time
 import warnings
 from pathlib import Path
-from typing import Literal, get_args
+from typing import Literal
 from urllib.parse import urlencode
 
 import cachetools
@@ -14,7 +14,6 @@ import pandas as pd
 
 import mteb
 from mteb import BenchmarkResults
-from mteb.abstasks.task_metadata import TaskDomain, TaskType
 from mteb.benchmarks.benchmark import RtebBenchmark
 from mteb.cache import ResultCache
 from mteb.leaderboard.benchmark_selector import (
@@ -29,7 +28,6 @@ from mteb.leaderboard.table import (
     apply_summary_styling_from_benchmark,
 )
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
-from mteb.types import Modalities
 
 logger = logging.getLogger(__name__)
 
@@ -139,7 +137,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df["languages"] = df["languages"].map(_format_list)
     df = df.sort_values("name")
     df["domains"] = df["domains"].map(_format_list)
-    df["name"] = f'<a href="{df["reference"]}" target="_blank">{df["name"]}</a>'
+    df["name"] = df.apply(
+        lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
+        axis=1,
+    )
     df["modalities"] = df["modalities"].map(_format_list)
     df = df.rename(
         columns={
@@ -155,9 +156,8 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df = df.drop(columns="reference")
     return gr.DataFrame(
         df,
-        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
-        show_copy_button=True,
-        show_fullscreen_button=True,
+        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),  # type: ignore
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -215,6 +215,110 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
     return True
 
 
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
+)
+def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
+    start_time = time.time()
+    benchmark = mteb.get_benchmark(benchmark_name)
+    languages = [task.languages for task in benchmark.tasks if task.languages]
+    languages = set(itertools.chain.from_iterable(languages))
+    languages = sorted(languages)
+    domains = [
+        task.metadata.domains for task in benchmark.tasks if task.metadata.domains
+    ]
+    domains = set(itertools.chain.from_iterable(domains))
+    types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
+    modalities = set()
+    for task in benchmark.tasks:
+        modalities.update(task.metadata.modalities)
+    languages, domains, types, modalities = (
+        sorted(languages),
+        sorted(domains),
+        sorted(types),
+        sorted(modalities),
+    )
+    elapsed = time.time() - start_time
+    benchmark_results = all_benchmark_results[benchmark_name]
+    scores = benchmark_results._get_scores(format="long")
+    logger.debug(f"on_benchmark_select callback: {elapsed}s")
+    show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
+
+    # Calculate initial models for this benchmark to avoid race conditions
+    benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
+    all_models_in_scores = list({entry["model_name"] for entry in scores})
+    initial_models = _filter_models(
+        all_models_in_scores,
+        benchmark_tasks,
+        availability=None,
+        compatibility=[],
+        instructions=None,
+        max_model_size=MAX_MODEL_SIZE,
+        zero_shot_setting="allow_all",
+    )
+    # Sort to ensure consistency with update_models
+    initial_models = sorted(initial_models)
+
+    return (
+        languages,
+        domains,
+        types,
+        modalities,
+        benchmark_tasks,
+        scores,
+        show_zero_shot,
+        initial_models,
+    )
+
+
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name,
+    type_select,
+    domain_select,
+    lang_select,
+    modality_select: hash(
+        (
+            hash(benchmark_name),
+            hash(tuple(type_select)),
+            hash(tuple(domain_select)),
+            hash(tuple(lang_select)),
+            hash(tuple(modality_select)),
+        )
+    ),
+)
+def _cache_update_task_list(
+    benchmark_name, type_select, domain_select, lang_select, modality_select
+):
+    if not len(lang_select):
+        return []
+    start_time = time.time()
+    benchmark_tasks = []
+    tasks_to_keep = []
+    for task in mteb.get_benchmark(benchmark_name).tasks:
+        benchmark_tasks.append(task.metadata.name)
+        if task.metadata.type not in type_select:
+            continue
+        if task.metadata.domains and not (
+            set(task.metadata.domains) & set(domain_select)
+        ):
+            continue
+        if task.languages and not (set(task.languages) & set(lang_select)):
+            continue
+        if task.metadata.modalities and not (
+            set(task.metadata.modalities) & set(modality_select)
+        ):
+            continue
+        tasks_to_keep.append(task.metadata.name)
+    benchmark_tasks.sort()
+    tasks_to_keep.sort()
+    elapsed = time.time() - start_time
+    logger.debug(f"update_task_list callback: {elapsed}s")
+
+    return benchmark_tasks, tasks_to_keep
+
+
 def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
     """Returns a Gradio Blocks app for the MTEB leaderboard."""
     logger.info("Loading all benchmark results")
@@ -227,6 +331,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
         for benchmark in benchmarks
     }
+
     default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
     default_results = all_benchmark_results[default_benchmark.name]
     logger.info("Benchmark results loaded")
@@ -257,55 +362,48 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         default_benchmark, filtered_benchmark_results
     )
 
-    lang_select = gr.Dropdown(
-        LANGUAGE,
+    lang_select = gr.CheckboxGroup(
+        sorted(default_results.languages),
         value=sorted(default_results.languages),
-        allow_custom_value=True,
-        multiselect=True,
+        show_label=True,
+        show_select_all=True,
         label="Language",
         info="Select languages to include.",
     )
-    type_select = gr.Dropdown(
-        sorted(get_args(TaskType)),
+    type_select = gr.CheckboxGroup(
+        sorted(default_results.task_types),
         value=sorted(default_results.task_types),
-        multiselect=True,
+        show_label=True,
+        show_select_all=True,
         label="Task Type",
         info="Select task types to include.",
    )
-    domain_select = gr.Dropdown(
-        sorted(get_args(TaskDomain)),
+    domain_select = gr.CheckboxGroup(
+        sorted(default_results.domains),
         value=sorted(default_results.domains),
-        multiselect=True,
+        show_label=True,
+        show_select_all=True,
         label="Domain",
         info="Select domains to include.",
     )
-    task_select = gr.Dropdown(
-        sorted(all_results.task_names),
+    task_select = gr.CheckboxGroup(
+        sorted(default_results.task_names),
         value=sorted(default_results.task_names),
-        allow_custom_value=True,
-        multiselect=True,
+        show_label=True,
+        show_select_all=True,
         label="Task",
         info="Select specific tasks to include",
     )
-    modality_select = gr.Dropdown(
-        sorted(get_args(Modalities)),
+    modality_select = gr.CheckboxGroup(
+        sorted(default_results.modalities),
         value=sorted(default_results.modalities),
-        multiselect=True,
+        show_label=True,
+        show_select_all=True,
         label="Modality",
         info="Select modalities to include.",
     )
 
-    head = """
-    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
-    """
-
-    with gr.Blocks(
-        fill_width=True,
-        theme=gr.themes.Soft(
-            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
-        ),
-        head=head,
-    ) as demo:
+    with gr.Blocks(fill_width=True) as demo:
         with gr.Sidebar(
             position="left",
             label="Benchmark Selection and Customization",
@@ -465,62 +563,25 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         # This sets the benchmark from the URL query parameters
         demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name: hash(benchmark_name),
-        )
         def on_benchmark_select(benchmark_name):
-            start_time = time.time()
-            benchmark = mteb.get_benchmark(benchmark_name)
-            languages = [task.languages for task in benchmark.tasks if task.languages]
-            languages = set(itertools.chain.from_iterable(languages))
-            languages = sorted(languages)
-            domains = [
-                task.metadata.domains
-                for task in benchmark.tasks
-                if task.metadata.domains
-            ]
-            domains = set(itertools.chain.from_iterable(domains))
-            types = {
-                task.metadata.type for task in benchmark.tasks if task.metadata.type
-            }
-            modalities = set()
-            for task in benchmark.tasks:
-                modalities.update(task.metadata.modalities)
-            languages, domains, types, modalities = (
-                sorted(languages),
-                sorted(domains),
-                sorted(types),
-                sorted(modalities),
-            )
-            elapsed = time.time() - start_time
-            benchmark_results = all_benchmark_results[benchmark_name]
-            scores = benchmark_results._get_scores(format="long")
-            logger.debug(f"on_benchmark_select callback: {elapsed}s")
-            show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
-
-            # Calculate initial models for this benchmark to avoid race conditions
-            benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
-            all_models_in_scores = list({entry["model_name"] for entry in scores})
-            initial_models = _filter_models(
-                all_models_in_scores,
-                benchmark_tasks,
-                availability=None,
-                compatibility=[],
-                instructions=None,
-                max_model_size=MAX_MODEL_SIZE,
-                zero_shot_setting="allow_all",
-            )
-            # Sort to ensure consistency with update_models
-            initial_models = sorted(initial_models)
-
-            return (
+            (
                 languages,
                 domains,
                 types,
                 modalities,
                 benchmark_tasks,
                 scores,
+                show_zero_shot,
+                initial_models,
+            ) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
+
+            return (
+                gr.update(choices=languages, value=languages),
+                gr.update(choices=domains, value=domains),
+                gr.update(choices=types, value=types),
+                gr.update(choices=modalities, value=modalities),
+                gr.update(choices=benchmark_tasks, value=benchmark_tasks),
+                scores,
                 gr.update(visible=show_zero_shot),
                 initial_models,
             )
@@ -562,48 +623,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             outputs=[scores],
         )
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name,
-            type_select,
-            domain_select,
-            lang_select,
-            modality_select: hash(
-                (
-                    hash(benchmark_name),
-                    hash(tuple(type_select)),
-                    hash(tuple(domain_select)),
-                    hash(tuple(lang_select)),
-                    hash(tuple(modality_select)),
-                )
-            ),
-        )
         def update_task_list(
             benchmark_name, type_select, domain_select, lang_select, modality_select
         ):
-            if not len(lang_select):
-                return []
-            start_time = time.time()
-            tasks_to_keep = []
-            for task in mteb.get_benchmark(benchmark_name).tasks:
-                if task.metadata.type not in type_select:
-                    continue
-                if task.metadata.domains is not None and not (
-                    set(task.metadata.domains) & set(domain_select)
-                ):
-                    continue
-                if task.languages is not None and not (
-                    set(task.languages) & set(lang_select)
-                ):
-                    continue
-                if task.metadata.modalities and not (
-                    set(task.metadata.modalities) & set(modality_select)
-                ):
-                    continue
-                tasks_to_keep.append(task.metadata.name)
-            elapsed = time.time() - start_time
-            logger.debug(f"update_task_list callback: {elapsed}s")
-            return sorted(tasks_to_keep)
+            benchmark_tasks, tasks_to_keep = _cache_update_task_list(
+                benchmark_name, type_select, domain_select, lang_select, modality_select
+            )
+            return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
 
         type_select.input(
             update_task_list,
@@ -913,4 +939,15 @@ if __name__ == "__main__":
     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
 
     app = get_leaderboard_app()
-    app.launch(server_name="0.0.0.0", server_port=7860)
+
+    head = """
+    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+    """
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        theme=gr.themes.Soft(
+            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+        ),
+        head=head,
+    )
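For reference, the filter widgets in this file move from `gr.Dropdown` to `gr.CheckboxGroup`, and the callbacks now return `gr.update(choices=..., value=...)` to refresh both the available options and the current selection. A standalone sketch of that pattern, not part of the package and assuming a Gradio release that supports `show_select_all`:

    import gradio as gr

    LANGS = ["dan", "eng", "fra"]  # placeholder language codes

    with gr.Blocks() as demo:
        lang_select = gr.CheckboxGroup(
            LANGS,
            value=LANGS,
            show_label=True,
            show_select_all=True,  # select-all toggle, as used by the leaderboard filters
            label="Language",
            info="Select languages to include.",
        )
        reset = gr.Button("Reset")
        # Returning gr.update(...) from a callback replaces both the choices and
        # the current value of an existing component.
        reset.click(lambda: gr.update(choices=LANGS, value=LANGS), outputs=[lang_select])

    if __name__ == "__main__":
        demo.launch()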
mteb/leaderboard/benchmark_selector.py CHANGED
@@ -75,14 +75,17 @@ GP_BENCHMARK_ENTRIES = [
             "MTEB(kor, v1)",
             "MTEB(nld, v1)",
             "MTEB(pol, v1)",
-            "MTEB(rus, v1)",
+            "MTEB(rus, v1.1)",
             "MTEB(fas, v2)",
             "VN-MTEB (vie, v1)",
         ]
     )
     + [
         MenuEntry(
-            "Other", mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)"])
+            "Other",
+            mteb.get_benchmarks(
+                ["MTEB(eng, v1)", "MTEB(fas, v1)", "MTEB(rus, v1)"]
+            ),
         )
     ],
 ),
mteb/leaderboard/table.py CHANGED
@@ -204,8 +204,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         pinned_columns=2,
         column_widths=column_widths,
         wrap=True,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -227,7 +226,6 @@ def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame:
         per_task_style,
         interactive=False,
         pinned_columns=1,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
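Both table builders swap the separate `show_copy_button`/`show_fullscreen_button` flags for a single `buttons` list. A minimal sketch of the new call shape (placeholder data; assumes a Gradio release where `gr.DataFrame` accepts `buttons`):

    import gradio as gr
    import pandas as pd

    df = pd.DataFrame({"task": ["NFCorpus"], "main_score": [0.35]})  # placeholder values

    with gr.Blocks() as demo:
        # `buttons` replaces the former show_copy_button / show_fullscreen_button flags.
        gr.DataFrame(df, buttons=["copy", "fullscreen"], show_search="filter")

    if __name__ == "__main__":
        demo.launch()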
mteb/models/model_implementations/colpali_models.py CHANGED
@@ -196,10 +196,10 @@ COLPALI_CITATION = """
 
 COLPALI_TRAINING_DATA = {
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "DocVQA",
-    "InfoVQA",
-    "TATDQA",
-    "arXivQA",
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
 }
 
 colpali_v1_1 = ModelMeta(