mteb 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +62 -1
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/evaluate.py +38 -7
- mteb/leaderboard/app.py +161 -124
- mteb/leaderboard/benchmark_selector.py +5 -2
- mteb/leaderboard/table.py +2 -4
- mteb/models/model_implementations/colpali_models.py +4 -4
- mteb/models/model_implementations/colqwen_models.py +206 -2
- mteb/models/model_implementations/euler_models.py +25 -0
- mteb/models/model_implementations/jina_models.py +203 -5
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +8 -9
- mteb/models/model_implementations/ru_sentence_models.py +9 -0
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +57 -0
- mteb/results/model_result.py +2 -1
- mteb/results/task_result.py +12 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/METADATA +5 -2
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/RECORD +27 -23
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/WHEEL +0 -0
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.0.dist-info → mteb-2.3.2.dist-info}/top_level.txt +0 -0
mteb/evaluate.py
CHANGED
```diff
@@ -7,6 +7,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, cast
 
+from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm
 
 from mteb._helpful_enum import HelpfulStrEnum
@@ -25,6 +26,7 @@ from mteb.models.sentence_transformer_wrapper import (
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import ModelResult, TaskResult
+from mteb.results.task_result import TaskError
 from mteb.types import HFSubset, PromptType, SplitName
 from mteb.types._metadata import ModelName, Revision
 
@@ -117,7 +119,8 @@ def _evaluate_task(
     co2_tracker: bool | None,
     encode_kwargs: dict[str, Any],
     prediction_folder: Path | None,
-) -> TaskResult:
+    public_only: bool | None,
+) -> TaskResult | TaskError:
     """The core logic to run a model on a given task. See `evaluate` for more details.
 
     Returns:
@@ -149,6 +152,7 @@ def _evaluate_task(
             encode_kwargs=encode_kwargs,
             co2_tracker=False,
             prediction_folder=prediction_folder,
+            public_only=public_only,
         )
         result.kg_co2_emissions = tracker.final_emissions
         return result
@@ -159,7 +163,20 @@ def _evaluate_task(
 
     data_loaded = task.data_loaded
     if not data_loaded:
-        task.load_data()
+        try:
+            task.load_data()
+        except DatasetNotFoundError as e:
+            if not task.metadata.is_public and public_only is None:
+                logger.warning(
+                    f"Dataset for private task '{task.metadata.name}' not found. "
+                    "Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
+                )
+                return TaskError(
+                    task_name=task.metadata.name,
+                    exception=str(e),
+                )
+            if public_only is False:
+                raise e
 
     evaluation_time = 0
 
@@ -281,6 +298,7 @@ def evaluate(
     overwrite_strategy: str | OverwriteStrategy = "only-missing",
     prediction_folder: Path | str | None = None,
     show_progress_bar: bool = True,
+    public_only: bool | None = None,
 ) -> ModelResult:
     """This function runs a model on a given task and returns the results.
 
@@ -304,6 +322,7 @@ def evaluate(
         prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be sabed in `prediction_folder/{task_name}_predictions.json`
         show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
             `encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
+        public_only: Run only public tasks. If None, it will attempt to run the private task.
 
     Returns:
         The results of the evaluation.
@@ -355,6 +374,7 @@ def evaluate(
             overwrite_strategy=overwrite_strategy,
             prediction_folder=prediction_folder,
             show_progress_bar=show_progress_bar,
+            public_only=public_only,
         )
         result = task.combine_task_results(results.task_results)
         return ModelResult(
@@ -367,6 +387,7 @@ def evaluate(
         task = tasks
     else:
         results = []
+        exceptions = []
        tasks_tqdm = tqdm(
            tasks,
            desc="Evaluating tasks",
@@ -384,12 +405,16 @@ def evaluate(
                overwrite_strategy=overwrite_strategy,
                prediction_folder=prediction_folder,
                show_progress_bar=False,
+                public_only=public_only,
            )
            results.extend(_res.task_results)
+            if _res.exceptions:
+                exceptions.extend(_res.exceptions)
        return ModelResult(
            model_name=_res.model_name,
            model_revision=_res.model_revision,
            task_results=results,
+            exceptions=exceptions,
        )
 
    overwrite_strategy = OverwriteStrategy.from_str(overwrite_strategy)
@@ -459,16 +484,13 @@ def evaluate(
                 co2_tracker=co2_tracker,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                public_only=public_only,
             )
         except Exception as e:
             logger.error(
                 f"Error while running task {task.metadata.name} on splits {list(missing_eval.keys())}: {e}"
             )
-            return ModelResult(
-                model_name=model_name,
-                model_revision=model_revision,
-                task_results=[],
-            )
+            result = TaskError(task_name=task.metadata.name, exception=str(e))
         else:
             result = _evaluate_task(
                 model=model,
@@ -477,9 +499,18 @@ def evaluate(
                 co2_tracker=False,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                public_only=public_only,
             )
             logger.info(f"✓ Finished evaluation for {task.metadata.name}")
 
+        if isinstance(result, TaskError):
+            return ModelResult(
+                model_name=model_name,
+                model_revision=model_revision,
+                task_results=[],
+                exceptions=[result],
+            )
+
         if existing_results:
             result = result.merge(existing_results)
 
```
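Taken together, the `evaluate.py` changes add a `public_only` flag and stop a single failing task from discarding the whole run: failures are now collected as `TaskError` entries on the returned `ModelResult`. A minimal usage sketch based on the signatures above; the model and task names are placeholders, not part of this diff:

```python
import mteb

# Placeholder model and task; any registered encoder/task should work here.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["STS12"])

# Per the new docstring: public_only=True runs only public tasks, while the
# default None attempts private tasks and records failures instead of raising.
result = mteb.evaluate(model, tasks, public_only=True)

# Failed tasks no longer abort the run; they are collected on the result.
for err in result.exceptions:
    print(err.task_name, err.exception)
```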
mteb/leaderboard/app.py
CHANGED
```diff
@@ -5,7 +5,7 @@ import tempfile
 import time
 import warnings
 from pathlib import Path
-from typing import Literal
+from typing import Literal
 from urllib.parse import urlencode
 
 import cachetools
@@ -14,7 +14,6 @@ import pandas as pd
 
 import mteb
 from mteb import BenchmarkResults
-from mteb.abstasks.task_metadata import TaskDomain, TaskType
 from mteb.benchmarks.benchmark import RtebBenchmark
 from mteb.cache import ResultCache
 from mteb.leaderboard.benchmark_selector import (
@@ -29,7 +28,6 @@ from mteb.leaderboard.table import (
     apply_summary_styling_from_benchmark,
 )
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
-from mteb.types import Modalities
 
 logger = logging.getLogger(__name__)
 
@@ -139,7 +137,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df["languages"] = df["languages"].map(_format_list)
     df = df.sort_values("name")
     df["domains"] = df["domains"].map(_format_list)
-    df["name"] =
+    df["name"] = df.apply(
+        lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
+        axis=1,
+    )
     df["modalities"] = df["modalities"].map(_format_list)
     df = df.rename(
         columns={
@@ -155,9 +156,8 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df = df.drop(columns="reference")
     return gr.DataFrame(
         df,
-        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
-        show_copy_button=True,
-        show_fullscreen_button=True,
+        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),  # type: ignore
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -215,6 +215,110 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
     return True
 
 
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
+)
+def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
+    start_time = time.time()
+    benchmark = mteb.get_benchmark(benchmark_name)
+    languages = [task.languages for task in benchmark.tasks if task.languages]
+    languages = set(itertools.chain.from_iterable(languages))
+    languages = sorted(languages)
+    domains = [
+        task.metadata.domains for task in benchmark.tasks if task.metadata.domains
+    ]
+    domains = set(itertools.chain.from_iterable(domains))
+    types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
+    modalities = set()
+    for task in benchmark.tasks:
+        modalities.update(task.metadata.modalities)
+    languages, domains, types, modalities = (
+        sorted(languages),
+        sorted(domains),
+        sorted(types),
+        sorted(modalities),
+    )
+    elapsed = time.time() - start_time
+    benchmark_results = all_benchmark_results[benchmark_name]
+    scores = benchmark_results._get_scores(format="long")
+    logger.debug(f"on_benchmark_select callback: {elapsed}s")
+    show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
+
+    # Calculate initial models for this benchmark to avoid race conditions
+    benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
+    all_models_in_scores = list({entry["model_name"] for entry in scores})
+    initial_models = _filter_models(
+        all_models_in_scores,
+        benchmark_tasks,
+        availability=None,
+        compatibility=[],
+        instructions=None,
+        max_model_size=MAX_MODEL_SIZE,
+        zero_shot_setting="allow_all",
+    )
+    # Sort to ensure consistency with update_models
+    initial_models = sorted(initial_models)
+
+    return (
+        languages,
+        domains,
+        types,
+        modalities,
+        benchmark_tasks,
+        scores,
+        show_zero_shot,
+        initial_models,
+    )
+
+
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name,
+    type_select,
+    domain_select,
+    lang_select,
+    modality_select: hash(
+        (
+            hash(benchmark_name),
+            hash(tuple(type_select)),
+            hash(tuple(domain_select)),
+            hash(tuple(lang_select)),
+            hash(tuple(modality_select)),
+        )
+    ),
+)
+def _cache_update_task_list(
+    benchmark_name, type_select, domain_select, lang_select, modality_select
+):
+    if not len(lang_select):
+        return []
+    start_time = time.time()
+    benchmark_tasks = []
+    tasks_to_keep = []
+    for task in mteb.get_benchmark(benchmark_name).tasks:
+        benchmark_tasks.append(task.metadata.name)
+        if task.metadata.type not in type_select:
+            continue
+        if task.metadata.domains and not (
+            set(task.metadata.domains) & set(domain_select)
+        ):
+            continue
+        if task.languages and not (set(task.languages) & set(lang_select)):
+            continue
+        if task.metadata.modalities and not (
+            set(task.metadata.modalities) & set(modality_select)
+        ):
+            continue
+        tasks_to_keep.append(task.metadata.name)
+    benchmark_tasks.sort()
+    tasks_to_keep.sort()
+    elapsed = time.time() - start_time
+    logger.debug(f"update_task_list callback: {elapsed}s")
+
+    return benchmark_tasks, tasks_to_keep
+
+
 def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
     """Returns a Gradio Blocks app for the MTEB leaderboard."""
     logger.info("Loading all benchmark results")
@@ -227,6 +331,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
         for benchmark in benchmarks
     }
+
     default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
     default_results = all_benchmark_results[default_benchmark.name]
     logger.info("Benchmark results loaded")
@@ -257,55 +362,48 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         default_benchmark, filtered_benchmark_results
     )
 
-    lang_select = gr.
-        sorted(
+    lang_select = gr.CheckboxGroup(
+        sorted(default_results.languages),
         value=sorted(default_results.languages),
-
-
+        show_label=True,
+        show_select_all=True,
         label="Language",
         info="Select languages to include.",
     )
-    type_select = gr.
-        sorted(
+    type_select = gr.CheckboxGroup(
+        sorted(default_results.task_types),
         value=sorted(default_results.task_types),
-
+        show_label=True,
+        show_select_all=True,
         label="Task Type",
         info="Select task types to include.",
    )
-    domain_select = gr.
-        sorted(
+    domain_select = gr.CheckboxGroup(
+        sorted(default_results.domains),
         value=sorted(default_results.domains),
-
+        show_label=True,
+        show_select_all=True,
         label="Domain",
         info="Select domains to include.",
     )
-    task_select = gr.
-        sorted(
+    task_select = gr.CheckboxGroup(
+        sorted(default_results.task_names),
         value=sorted(default_results.task_names),
-
-
+        show_label=True,
+        show_select_all=True,
         label="Task",
         info="Select specific tasks to include",
     )
-    modality_select = gr.
-        sorted(
+    modality_select = gr.CheckboxGroup(
+        sorted(default_results.modalities),
         value=sorted(default_results.modalities),
-
+        show_label=True,
+        show_select_all=True,
         label="Modality",
         info="Select modalities to include.",
     )
 
-    head = """
-    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
-    """
-
-    with gr.Blocks(
-        fill_width=True,
-        theme=gr.themes.Soft(
-            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
-        ),
-        head=head,
-    ) as demo:
+    with gr.Blocks(fill_width=True) as demo:
         with gr.Sidebar(
             position="left",
             label="Benchmark Selection and Customization",
@@ -465,62 +563,25 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         # This sets the benchmark from the URL query parameters
         demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name: hash(benchmark_name),
-        )
         def on_benchmark_select(benchmark_name):
-            start_time = time.time()
-            benchmark = mteb.get_benchmark(benchmark_name)
-            languages = [task.languages for task in benchmark.tasks if task.languages]
-            languages = set(itertools.chain.from_iterable(languages))
-            languages = sorted(languages)
-            domains = [
-                task.metadata.domains
-                for task in benchmark.tasks
-                if task.metadata.domains
-            ]
-            domains = set(itertools.chain.from_iterable(domains))
-            types = {
-                task.metadata.type for task in benchmark.tasks if task.metadata.type
-            }
-            modalities = set()
-            for task in benchmark.tasks:
-                modalities.update(task.metadata.modalities)
-            languages, domains, types, modalities = (
-                sorted(languages),
-                sorted(domains),
-                sorted(types),
-                sorted(modalities),
-            )
-            elapsed = time.time() - start_time
-            benchmark_results = all_benchmark_results[benchmark_name]
-            scores = benchmark_results._get_scores(format="long")
-            logger.debug(f"on_benchmark_select callback: {elapsed}s")
-            show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
-
-            # Calculate initial models for this benchmark to avoid race conditions
-            benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
-            all_models_in_scores = list({entry["model_name"] for entry in scores})
-            initial_models = _filter_models(
-                all_models_in_scores,
-                benchmark_tasks,
-                availability=None,
-                compatibility=[],
-                instructions=None,
-                max_model_size=MAX_MODEL_SIZE,
-                zero_shot_setting="allow_all",
-            )
-            # Sort to ensure consistency with update_models
-            initial_models = sorted(initial_models)
-
-            return (
+            (
                 languages,
                 domains,
                 types,
                 modalities,
                 benchmark_tasks,
                 scores,
+                show_zero_shot,
+                initial_models,
+            ) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
+
+            return (
+                gr.update(choices=languages, value=languages),
+                gr.update(choices=domains, value=domains),
+                gr.update(choices=types, value=types),
+                gr.update(choices=modalities, value=modalities),
+                gr.update(choices=benchmark_tasks, value=benchmark_tasks),
+                scores,
                 gr.update(visible=show_zero_shot),
                 initial_models,
             )
@@ -562,48 +623,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             outputs=[scores],
         )
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name,
-            type_select,
-            domain_select,
-            lang_select,
-            modality_select: hash(
-                (
-                    hash(benchmark_name),
-                    hash(tuple(type_select)),
-                    hash(tuple(domain_select)),
-                    hash(tuple(lang_select)),
-                    hash(tuple(modality_select)),
-                )
-            ),
-        )
         def update_task_list(
             benchmark_name, type_select, domain_select, lang_select, modality_select
         ):
-            if not len(lang_select):
-                return []
-            start_time = time.time()
-            tasks_to_keep = []
-            for task in mteb.get_benchmark(benchmark_name).tasks:
-                if task.metadata.type not in type_select:
-                    continue
-                if task.metadata.domains is not None and not (
-                    set(task.metadata.domains) & set(domain_select)
-                ):
-                    continue
-                if task.languages is not None and not (
-                    set(task.languages) & set(lang_select)
-                ):
-                    continue
-                if task.metadata.modalities and not (
-                    set(task.metadata.modalities) & set(modality_select)
-                ):
-                    continue
-                tasks_to_keep.append(task.metadata.name)
-            elapsed = time.time() - start_time
-            logger.debug(f"update_task_list callback: {elapsed}s")
-            return sorted(tasks_to_keep)
+            benchmark_tasks, tasks_to_keep = _cache_update_task_list(
+                benchmark_name, type_select, domain_select, lang_select, modality_select
+            )
+            return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
 
         type_select.input(
             update_task_list,
@@ -913,4 +939,15 @@ if __name__ == "__main__":
     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
 
     app = get_leaderboard_app()
-    app.launch(server_name="0.0.0.0", server_port=7860)
+
+    head = """
+    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+    """
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        theme=gr.themes.Soft(
+            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+        ),
+        head=head,
+    )
```
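The leaderboard callbacks were refactored so their heavy work lives in module-level functions memoized with `cachetools.cached`, each with a custom `key` that hashes only the hashable arguments (benchmark name, selection tuples) and skips unhashable ones like the results mapping. A small self-contained sketch of that pattern; the function and argument names below are illustrative, not from the diff:

```python
import cachetools


@cachetools.cached(
    cache={},  # a plain dict is a valid cache backend
    # Hash only the name; the second argument (an unhashable dict) is
    # deliberately excluded from the key, mirroring _cache_on_benchmark_select.
    key=lambda name, heavy_state: hash(name),
)
def expensive_lookup(name: str, heavy_state: dict) -> str:
    print(f"computing for {name}")  # executes once per distinct name
    return heavy_state[name].upper()


state = {"mteb": "leaderboard"}
expensive_lookup("mteb", state)  # computes and caches
expensive_lookup("mteb", state)  # cache hit: no print
```

The Gradio event handlers stay as thin wrappers that unpack the cached tuples into `gr.update(...)` calls, so switching back to a previously viewed benchmark no longer recomputes its language, domain, and task lists.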
mteb/leaderboard/benchmark_selector.py
CHANGED

```diff
@@ -75,14 +75,17 @@ GP_BENCHMARK_ENTRIES = [
             "MTEB(kor, v1)",
             "MTEB(nld, v1)",
             "MTEB(pol, v1)",
-            "MTEB(rus, v1)",
+            "MTEB(rus, v1.1)",
             "MTEB(fas, v2)",
             "VN-MTEB (vie, v1)",
         ]
     )
     + [
         MenuEntry(
-            "Other",
+            "Other",
+            mteb.get_benchmarks(
+                ["MTEB(eng, v1)", "MTEB(fas, v1)", "MTEB(rus, v1)"]
+            ),
         )
     ],
 ),
```
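For context, `mteb.get_benchmarks` resolves a list of benchmark names into `Benchmark` objects, which is how the legacy benchmarks get attached to the "Other" menu entry above. A quick illustrative sketch:

```python
import mteb

# Resolve the legacy benchmarks now grouped under "Other".
legacy = mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)", "MTEB(rus, v1)"])
for bench in legacy:
    print(bench.name, len(bench.tasks))
```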
mteb/leaderboard/table.py
CHANGED
```diff
@@ -204,8 +204,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         pinned_columns=2,
         column_widths=column_widths,
         wrap=True,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -227,7 +226,6 @@ def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame:
         per_task_style,
         interactive=False,
         pinned_columns=1,
-        show_fullscreen_button=True,
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
```
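Both leaderboard tables switch from the individual `show_copy_button`/`show_fullscreen_button` flags to a single consolidated `buttons` list. A minimal sketch, assuming a Gradio version whose `gr.DataFrame` accepts `buttons` as shown in the diff; the data here is made up:

```python
import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["model-a", "model-b"], "Mean (Task)": [61.2, 58.9]})

with gr.Blocks() as demo:
    gr.DataFrame(
        df,
        buttons=["copy", "fullscreen"],  # replaces the two show_*_button flags
        show_search="filter",
    )
```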
mteb/models/model_implementations/colpali_models.py
CHANGED

```diff
@@ -196,10 +196,10 @@ COLPALI_CITATION = """
 
 COLPALI_TRAINING_DATA = {
     # from https://huggingface.co/datasets/vidore/colpali_train_set
-    "
-    "
-    "
-    "
+    "VidoreDocVQARetrieval",
+    "VidoreInfoVQARetrieval",
+    "VidoreTatdqaRetrieval",
+    "VidoreArxivQARetrieval",
 }
 
 colpali_v1_1 = ModelMeta(
```
|