mteb 2.3.1-py3-none-any.whl → 2.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +62 -1
- mteb/leaderboard/app.py +161 -124
- mteb/leaderboard/benchmark_selector.py +5 -2
- mteb/leaderboard/table.py +2 -4
- mteb/models/model_implementations/ru_sentence_models.py +9 -0
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/METADATA +2 -2
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/RECORD +12 -12
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/WHEEL +0 -0
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/top_level.txt +0 -0
mteb/benchmarks/benchmarks/__init__.py CHANGED

@@ -43,6 +43,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     VN_MTEB,
     CoIR,
     MTEB_code,
+    MTEB_MAIN_RU_v1_1,
     MTEB_multilingual_v1,
     MTEB_multilingual_v2,
     RAR_b,
@@ -113,6 +114,7 @@ __all__ = [
     "VISUAL_DOCUMENT_RETRIEVAL",
     "VN_MTEB",
     "CoIR",
+    "MTEB_MAIN_RU_v1_1",
     "MTEB_code",
     "MTEB_multilingual_v1",
     "MTEB_multilingual_v2",
mteb/benchmarks/benchmarks/benchmarks.py CHANGED

@@ -185,7 +185,7 @@ We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benc
 
 MTEB_MAIN_RU = Benchmark(
     name="MTEB(rus, v1)",
-    display_name="Russian",
+    display_name="Russian legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
     tasks=MTEBTasks(
         get_tasks(
@@ -240,6 +240,67 @@ MTEB_MAIN_RU = Benchmark(
   year = {2024},
 }
 """,
+    contacts=["Samoed", "artemsnegirev", "Drozhzhinastya"],
+)
+
+MTEB_MAIN_RU_v1_1 = Benchmark(
+    name="MTEB(rus, v1.1)",
+    display_name="Russian",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
+    tasks=MTEBTasks(
+        get_tasks(
+            languages=["rus"],
+            tasks=[
+                # Classification
+                "GeoreviewClassification",
+                "HeadlineClassification",
+                "InappropriatenessClassification",
+                "KinopoiskClassification",
+                "MassiveIntentClassification",
+                "MassiveScenarioClassification",
+                "RuReviewsClassification",
+                "RuSciBenchGRNTIClassification",
+                "RuSciBenchOECDClassification",
+                # Clustering
+                "GeoreviewClusteringP2P",
+                "RuSciBenchGRNTIClusteringP2P",
+                "RuSciBenchOECDClusteringP2P",
+                # MultiLabelClassification
+                "CEDRClassification",
+                "SensitiveTopicsClassification",
+                # PairClassification
+                "TERRa",
+                # Reranking
+                "MIRACLReranking",
+                "RuBQReranking",
+                # Retrieval
+                "MIRACLRetrievalHardNegatives.v2",
+                "RiaNewsRetrievalHardNegatives.v2",
+                "RuBQRetrieval",
+                # STS
+                "RUParaPhraserSTS",
+                "STS22",
+            ],
+        )
+        + get_tasks(
+            tasks=["RuSTSBenchmarkSTS"],
+            eval_splits=["test"],
+        )
+    ),
+    description="A Russian version of the Massive Text Embedding Benchmark covering the task categories of classification, clustering, reranking, pair classification, retrieval, and semantic similarity. In v1.1, MIRACLRetrieval and RiaNewsRetrieval were replaced with their HardNegatives variants for improved time-optimization measurement. MIRACLRetrievalHardNegatives and RiaNewsRetrievalHardNegatives are used in their updated versions (v2), both of which include improved default prompts.",
+    reference="https://aclanthology.org/2023.eacl-main.148/",
+    citation=r"""
+@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
+  archiveprefix = {arXiv},
+  author = {Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
+  eprint = {2408.12503},
+  primaryclass = {cs.CL},
+  title = {The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
+  url = {https://arxiv.org/abs/2408.12503},
+  year = {2024},
+}
+""",
+    contacts=["Samoed", "artemsnegirev", "Drozhzhinastya"],
 )
 
 
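For reference, the benchmark added above is registered under the name "MTEB(rus, v1.1)" and re-exported from mteb.benchmarks.benchmarks, so it can be looked up like any other benchmark. A minimal sketch, using only API calls that appear elsewhere in this diff (mteb.get_benchmark, benchmark.tasks, task.metadata):

import mteb

# Look up the Russian benchmark introduced in 2.3.2 by its registered name.
benchmark = mteb.get_benchmark("MTEB(rus, v1.1)")

# Inspect the bundled tasks; the *.v2 HardNegatives retrieval variants replace
# MIRACLRetrieval / RiaNewsRetrieval from the legacy MTEB(rus, v1).
for task in benchmark.tasks:
    print(task.metadata.type, task.metadata.name)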
mteb/leaderboard/app.py CHANGED

@@ -5,7 +5,7 @@ import tempfile
 import time
 import warnings
 from pathlib import Path
-from typing import Literal
+from typing import Literal
 from urllib.parse import urlencode
 
 import cachetools
@@ -14,7 +14,6 @@ import pandas as pd
 
 import mteb
 from mteb import BenchmarkResults
-from mteb.abstasks.task_metadata import TaskDomain, TaskType
 from mteb.benchmarks.benchmark import RtebBenchmark
 from mteb.cache import ResultCache
 from mteb.leaderboard.benchmark_selector import (
@@ -29,7 +28,6 @@ from mteb.leaderboard.table import (
     apply_summary_styling_from_benchmark,
 )
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
-from mteb.types import Modalities
 
 logger = logging.getLogger(__name__)
 
@@ -139,7 +137,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df["languages"] = df["languages"].map(_format_list)
     df = df.sort_values("name")
     df["domains"] = df["domains"].map(_format_list)
-    df["name"] =
+    df["name"] = df.apply(
+        lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
+        axis=1,
+    )
     df["modalities"] = df["modalities"].map(_format_list)
     df = df.rename(
         columns={
@@ -155,9 +156,8 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df = df.drop(columns="reference")
     return gr.DataFrame(
         df,
-        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
-
-        show_fullscreen_button=True,
+        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),  # type: ignore
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -215,6 +215,110 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
     return True
 
 
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
+)
+def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
+    start_time = time.time()
+    benchmark = mteb.get_benchmark(benchmark_name)
+    languages = [task.languages for task in benchmark.tasks if task.languages]
+    languages = set(itertools.chain.from_iterable(languages))
+    languages = sorted(languages)
+    domains = [
+        task.metadata.domains for task in benchmark.tasks if task.metadata.domains
+    ]
+    domains = set(itertools.chain.from_iterable(domains))
+    types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
+    modalities = set()
+    for task in benchmark.tasks:
+        modalities.update(task.metadata.modalities)
+    languages, domains, types, modalities = (
+        sorted(languages),
+        sorted(domains),
+        sorted(types),
+        sorted(modalities),
+    )
+    elapsed = time.time() - start_time
+    benchmark_results = all_benchmark_results[benchmark_name]
+    scores = benchmark_results._get_scores(format="long")
+    logger.debug(f"on_benchmark_select callback: {elapsed}s")
+    show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
+
+    # Calculate initial models for this benchmark to avoid race conditions
+    benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
+    all_models_in_scores = list({entry["model_name"] for entry in scores})
+    initial_models = _filter_models(
+        all_models_in_scores,
+        benchmark_tasks,
+        availability=None,
+        compatibility=[],
+        instructions=None,
+        max_model_size=MAX_MODEL_SIZE,
+        zero_shot_setting="allow_all",
+    )
+    # Sort to ensure consistency with update_models
+    initial_models = sorted(initial_models)
+
+    return (
+        languages,
+        domains,
+        types,
+        modalities,
+        benchmark_tasks,
+        scores,
+        show_zero_shot,
+        initial_models,
+    )
+
+
+@cachetools.cached(
+    cache={},
+    key=lambda benchmark_name,
+    type_select,
+    domain_select,
+    lang_select,
+    modality_select: hash(
+        (
+            hash(benchmark_name),
+            hash(tuple(type_select)),
+            hash(tuple(domain_select)),
+            hash(tuple(lang_select)),
+            hash(tuple(modality_select)),
+        )
+    ),
+)
+def _cache_update_task_list(
+    benchmark_name, type_select, domain_select, lang_select, modality_select
+):
+    if not len(lang_select):
+        return []
+    start_time = time.time()
+    benchmark_tasks = []
+    tasks_to_keep = []
+    for task in mteb.get_benchmark(benchmark_name).tasks:
+        benchmark_tasks.append(task.metadata.name)
+        if task.metadata.type not in type_select:
+            continue
+        if task.metadata.domains and not (
+            set(task.metadata.domains) & set(domain_select)
+        ):
+            continue
+        if task.languages and not (set(task.languages) & set(lang_select)):
+            continue
+        if task.metadata.modalities and not (
+            set(task.metadata.modalities) & set(modality_select)
+        ):
+            continue
+        tasks_to_keep.append(task.metadata.name)
+    benchmark_tasks.sort()
+    tasks_to_keep.sort()
+    elapsed = time.time() - start_time
+    logger.debug(f"update_task_list callback: {elapsed}s")
+
+    return benchmark_tasks, tasks_to_keep
+
+
 def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
     """Returns a Gradio Blocks app for the MTEB leaderboard."""
     logger.info("Loading all benchmark results")
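The two module-level helpers added above move the expensive per-benchmark work out of the Gradio callbacks and memoize it with cachetools.cached, using a custom key= so that unhashable arguments (the results mapping, the list-valued selections) are reduced to hashable cache keys. A minimal standalone sketch of that pattern, assuming only the cachetools package; the function and argument names here are illustrative, not mteb's:

import cachetools

@cachetools.cached(
    cache={},
    # Hash only the name and the selections; the unhashable results dict is
    # deliberately left out of the key, mirroring the key= lambdas above.
    key=lambda name, selections, results: hash((name, tuple(selections))),
)
def expensive_lookup(name, selections, results):
    # Stand-in for the slow aggregation the real callbacks perform.
    return [row for row in results.get(name, []) if row in selections]

results = {"demo": ["a", "b", "c"]}
print(expensive_lookup("demo", ["a", "c"], results))  # computed
print(expensive_lookup("demo", ["a", "c"], results))  # served from the cache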
@@ -227,6 +331,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
         for benchmark in benchmarks
     }
+
     default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
     default_results = all_benchmark_results[default_benchmark.name]
     logger.info("Benchmark results loaded")
@@ -257,55 +362,48 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             default_benchmark, filtered_benchmark_results
         )
 
-        lang_select = gr.
-
+        lang_select = gr.CheckboxGroup(
+            sorted(default_results.languages),
             value=sorted(default_results.languages),
-
-
+            show_label=True,
+            show_select_all=True,
             label="Language",
             info="Select languages to include.",
         )
-        type_select = gr.
-            sorted(
+        type_select = gr.CheckboxGroup(
+            sorted(default_results.task_types),
             value=sorted(default_results.task_types),
-
+            show_label=True,
+            show_select_all=True,
             label="Task Type",
             info="Select task types to include.",
         )
-        domain_select = gr.
-            sorted(
+        domain_select = gr.CheckboxGroup(
+            sorted(default_results.domains),
             value=sorted(default_results.domains),
-
+            show_label=True,
+            show_select_all=True,
             label="Domain",
             info="Select domains to include.",
         )
-        task_select = gr.
-            sorted(
+        task_select = gr.CheckboxGroup(
+            sorted(default_results.task_names),
             value=sorted(default_results.task_names),
-
-
+            show_label=True,
+            show_select_all=True,
             label="Task",
             info="Select specific tasks to include",
         )
-        modality_select = gr.
-            sorted(
+        modality_select = gr.CheckboxGroup(
+            sorted(default_results.modalities),
             value=sorted(default_results.modalities),
-
+            show_label=True,
+            show_select_all=True,
             label="Modality",
             info="Select modalities to include.",
         )
 
-
-    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
-    """
-
-    with gr.Blocks(
-        fill_width=True,
-        theme=gr.themes.Soft(
-            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
-        ),
-        head=head,
-    ) as demo:
+    with gr.Blocks(fill_width=True) as demo:
         with gr.Sidebar(
             position="left",
             label="Benchmark Selection and Customization",
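The selectors above switch to gr.CheckboxGroup, and the callbacks later in this file now return gr.update(choices=..., value=...) so that the available options and the ticked values are refreshed together. A minimal sketch of that pattern, assuming the gradio==6.0.1 pin from this release (which provides show_select_all); the component names are illustrative:

import gradio as gr

with gr.Blocks() as demo:
    langs = gr.CheckboxGroup(
        ["eng", "rus"],
        value=["eng", "rus"],
        show_label=True,
        show_select_all=True,
        label="Language",
    )
    only_rus = gr.Button("Russian only")

    def restrict():
        # A single gr.update can swap both the offered choices and the
        # current selection of an existing CheckboxGroup.
        return gr.update(choices=["rus"], value=["rus"])

    only_rus.click(restrict, inputs=[], outputs=[langs])

if __name__ == "__main__":
    demo.launch()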
@@ -465,62 +563,25 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         # This sets the benchmark from the URL query parameters
         demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name: hash(benchmark_name),
-        )
         def on_benchmark_select(benchmark_name):
-
-            benchmark = mteb.get_benchmark(benchmark_name)
-            languages = [task.languages for task in benchmark.tasks if task.languages]
-            languages = set(itertools.chain.from_iterable(languages))
-            languages = sorted(languages)
-            domains = [
-                task.metadata.domains
-                for task in benchmark.tasks
-                if task.metadata.domains
-            ]
-            domains = set(itertools.chain.from_iterable(domains))
-            types = {
-                task.metadata.type for task in benchmark.tasks if task.metadata.type
-            }
-            modalities = set()
-            for task in benchmark.tasks:
-                modalities.update(task.metadata.modalities)
-            languages, domains, types, modalities = (
-                sorted(languages),
-                sorted(domains),
-                sorted(types),
-                sorted(modalities),
-            )
-            elapsed = time.time() - start_time
-            benchmark_results = all_benchmark_results[benchmark_name]
-            scores = benchmark_results._get_scores(format="long")
-            logger.debug(f"on_benchmark_select callback: {elapsed}s")
-            show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
-
-            # Calculate initial models for this benchmark to avoid race conditions
-            benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
-            all_models_in_scores = list({entry["model_name"] for entry in scores})
-            initial_models = _filter_models(
-                all_models_in_scores,
-                benchmark_tasks,
-                availability=None,
-                compatibility=[],
-                instructions=None,
-                max_model_size=MAX_MODEL_SIZE,
-                zero_shot_setting="allow_all",
-            )
-            # Sort to ensure consistency with update_models
-            initial_models = sorted(initial_models)
-
-            return (
+            (
                 languages,
                 domains,
                 types,
                 modalities,
                 benchmark_tasks,
                 scores,
+                show_zero_shot,
+                initial_models,
+            ) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
+
+            return (
+                gr.update(choices=languages, value=languages),
+                gr.update(choices=domains, value=domains),
+                gr.update(choices=types, value=types),
+                gr.update(choices=modalities, value=modalities),
+                gr.update(choices=benchmark_tasks, value=benchmark_tasks),
+                scores,
                 gr.update(visible=show_zero_shot),
                 initial_models,
             )
@@ -562,48 +623,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             outputs=[scores],
         )
 
-        @cachetools.cached(
-            cache={},
-            key=lambda benchmark_name,
-            type_select,
-            domain_select,
-            lang_select,
-            modality_select: hash(
-                (
-                    hash(benchmark_name),
-                    hash(tuple(type_select)),
-                    hash(tuple(domain_select)),
-                    hash(tuple(lang_select)),
-                    hash(tuple(modality_select)),
-                )
-            ),
-        )
         def update_task_list(
             benchmark_name, type_select, domain_select, lang_select, modality_select
         ):
-
-
-
-
-            for task in mteb.get_benchmark(benchmark_name).tasks:
-                if task.metadata.type not in type_select:
-                    continue
-                if task.metadata.domains is not None and not (
-                    set(task.metadata.domains) & set(domain_select)
-                ):
-                    continue
-                if task.languages is not None and not (
-                    set(task.languages) & set(lang_select)
-                ):
-                    continue
-                if task.metadata.modalities and not (
-                    set(task.metadata.modalities) & set(modality_select)
-                ):
-                    continue
-                tasks_to_keep.append(task.metadata.name)
-            elapsed = time.time() - start_time
-            logger.debug(f"update_task_list callback: {elapsed}s")
-            return sorted(tasks_to_keep)
+            benchmark_tasks, tasks_to_keep = _cache_update_task_list(
+                benchmark_name, type_select, domain_select, lang_select, modality_select
+            )
+            return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
 
         type_select.input(
             update_task_list,
@@ -913,4 +939,15 @@ if __name__ == "__main__":
     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
 
     app = get_leaderboard_app()
-
+
+    head = """
+    <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+    """
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        theme=gr.themes.Soft(
+            font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+        ),
+        head=head,
+    )
mteb/leaderboard/benchmark_selector.py CHANGED

@@ -75,14 +75,17 @@ GP_BENCHMARK_ENTRIES = [
             "MTEB(kor, v1)",
             "MTEB(nld, v1)",
             "MTEB(pol, v1)",
-            "MTEB(rus, v1)",
+            "MTEB(rus, v1.1)",
             "MTEB(fas, v2)",
             "VN-MTEB (vie, v1)",
         ]
     )
     + [
         MenuEntry(
-            "Other",
+            "Other",
+            mteb.get_benchmarks(
+                ["MTEB(eng, v1)", "MTEB(fas, v1)", "MTEB(rus, v1)"]
+            ),
         )
     ],
 ),
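With this change the sidebar promotes MTEB(rus, v1.1) into the per-language list and moves the superseded benchmarks, including MTEB(rus, v1), under the "Other" entry via mteb.get_benchmarks. A small sketch of the lookup this relies on, using only the calls visible in the hunk; printing the names is illustrative:

import mteb

# The "Other" menu entry resolves the legacy benchmarks by name in one call.
legacy = mteb.get_benchmarks(["MTEB(eng, v1)", "MTEB(fas, v1)", "MTEB(rus, v1)"])
for bench in legacy:
    print(bench.name, "->", bench.display_name)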
mteb/leaderboard/table.py CHANGED

@@ -204,8 +204,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         pinned_columns=2,
         column_widths=column_widths,
         wrap=True,
-
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
 
@@ -227,7 +226,6 @@ def _apply_per_task_table_styling(per_task: pd.DataFrame) -> gr.DataFrame:
         per_task_style,
         interactive=False,
         pinned_columns=1,
-
-        show_copy_button=True,
+        buttons=["copy", "fullscreen"],
         show_search="filter",
     )
mteb/models/model_implementations/ru_sentence_models.py CHANGED

@@ -43,6 +43,10 @@ GIGA_task_prompts = {
         "query": "Given a news title, retrieve relevant news article",
         "document": "",
     },
+    "RiaNewsRetrievalHardNegatives.v2": {
+        "query": "Given a news title, retrieve relevant news article",
+        "document": "",
+    },
     "MIRACLReranking": {
         "query": "Given a question, retrieve Wikipedia passages that answer the question",
         "document": "",

@@ -51,6 +55,10 @@ GIGA_task_prompts = {
         "query": "Given a question, retrieve Wikipedia passages that answer the question",
         "document": "",
     },
+    "MIRACLRetrievalHardNegatives.v2": {
+        "query": "Given a question, retrieve Wikipedia passages that answer the question",
+        "document": "",
+    },
     "ArguAna": {
         "query": "Given a search query, retrieve passages that answer the question",
         "document": "Given a search query, retrieve passages that answer the question",
@@ -755,6 +763,7 @@ frida_prompts = {
     "SensitiveTopicsClassification": "categorize_topic: ",
     "TERRa": "categorize_entailment: ",
     "RiaNewsRetrieval": "categorize: ",
+    "RiaNewsRetrievalHardNegatives.v2": "",
 }
 
 frida_training_datasets = {
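The dictionaries above map task names to instruction prompts for the Russian models; the new *.v2 entries reuse the wording of the tasks they replace, and the FRIDA entry is intentionally empty. A hedged sketch of how such a per-task prompt table can be consumed, with a hypothetical resolve_prompt helper that is not part of mteb:

# Hypothetical helper; mteb wires these tables into its model loaders itself.
def resolve_prompt(prompt_table: dict[str, str], task_name: str, default: str = "") -> str:
    """Return the task-specific instruction, falling back to a default."""
    return prompt_table.get(task_name, default)

frida_prompts_excerpt = {
    "RiaNewsRetrieval": "categorize: ",
    "RiaNewsRetrievalHardNegatives.v2": "",  # added in 2.3.2, intentionally empty
}

print(repr(resolve_prompt(frida_prompts_excerpt, "RiaNewsRetrievalHardNegatives.v2")))
print(repr(resolve_prompt(frida_prompts_excerpt, "UnknownTask", default="categorize: ")))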
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.3.1
+Version: 2.3.2
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>

@@ -37,7 +37,7 @@ Requires-Dist: torchvision>0.2.1; extra == "image"
 Provides-Extra: codecarbon
 Requires-Dist: codecarbon<3.0.0,>=2.0.0; extra == "codecarbon"
 Provides-Extra: leaderboard
-Requires-Dist: gradio==
+Requires-Dist: gradio==6.0.1; extra == "leaderboard"
 Requires-Dist: plotly<6.0.0,>=5.24.0; extra == "leaderboard"
 Requires-Dist: cachetools>=5.2.0; extra == "leaderboard"
 Requires-Dist: matplotlib>=3.9.4; extra == "leaderboard"
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/RECORD CHANGED

@@ -55,8 +55,8 @@ mteb/benchmarks/__init__.py,sha256=MQEVeli-zLaJ7Xg0z7RhXQwsdmm7Ht_W2Ln0rZo1Szc,2
 mteb/benchmarks/_create_table.py,sha256=z3iqa5dajLk0DYxEE9EeO1qpR3VJXokg8ZQ2rdUkvdM,20452
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
-mteb/benchmarks/benchmarks/__init__.py,sha256=
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=
+mteb/benchmarks/benchmarks/__init__.py,sha256=0ySgD14Mu3Y1nJzazR_eUir81ia3x6E23N57SzQNkF0,2150
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=Ob2cHVXwFk328xbV-2ZmUibiVAMtT2RN1ygGgiP6UNQ,92662
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -1424,10 +1424,10 @@ mteb/languages/language_family.json,sha256=OUGcHeOIPcZPb2FWmYLhxTS0JxjK5y3Fo6x0P
 mteb/languages/language_scripts.py,sha256=5wix9HTYolNIpTiS5oXf2pGJyL7ftdGKs_m432w81V8,3998
 mteb/languages/programming_languages.py,sha256=zxAakT3OSUnAuTnQ34VyeFIECnNXMlleZmAake6jsZE,211
 mteb/leaderboard/__init__.py,sha256=991roXmtRwEQysV-37hWEzWpkvPgMCGRqZTHR-hm2io,88
-mteb/leaderboard/app.py,sha256=
-mteb/leaderboard/benchmark_selector.py,sha256=
+mteb/leaderboard/app.py,sha256=rwU3sHxx8YP3kFOvFNAF8izgBd5zgv6lrvO4mZcEmfA,33255
+mteb/leaderboard/benchmark_selector.py,sha256=uH66SI0iT1J4_fnebViWa83dQwhPi7toBv7PRL_epDw,7784
 mteb/leaderboard/figures.py,sha256=Rq20LFpaUhQD4tuKp7P7ExQtAjonMLibgO3ud0ykMag,7491
-mteb/leaderboard/table.py,sha256=
+mteb/leaderboard/table.py,sha256=ZBCW8JDk5gLbi06FA6zuGESQ5Xri0XZIO0uK-aWb2us,7772
 mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
 mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
 mteb/models/abs_encoder.py,sha256=m0JkRfRPMYadDgBR9eozRloI31ZSWkSzDFINpwbfLZk,16533
@@ -1531,7 +1531,7 @@ mteb/models/model_implementations/repllama_models.py,sha256=89HoqEpzkNysHeuf_-Yh
 mteb/models/model_implementations/rerankers_custom.py,sha256=ro73A9-hHudy3_qIMrhP-ja-3Xqu78r_aORm856zHQc,10651
 mteb/models/model_implementations/rerankers_monot5_based.py,sha256=rxVwzapNnHl4gCw79XVCaTXj3-wbToyj7XVL97tpAF4,34302
 mteb/models/model_implementations/richinfoai_models.py,sha256=llvYa0JUjyOOMbuTgOYoJ2qeqZ5rLHX1ZjZIYlYbdvA,989
-mteb/models/model_implementations/ru_sentence_models.py,sha256=
+mteb/models/model_implementations/ru_sentence_models.py,sha256=GuZFwbzaooufvSMGNjIsL0DDLrqHjhdSsAQHHZo5H08,40480
 mteb/models/model_implementations/salesforce_models.py,sha256=KslTK-IKeLvNG-vQir9k6swkaOgjk6eyozm_BOVgTpY,5160
 mteb/models/model_implementations/samilpwc_models.py,sha256=oMwKNwCxoH1jZgCy04oo2oVlBZWu253QMpnEEC6emz8,2021
 mteb/models/model_implementations/searchmap_models.py,sha256=XvVl99emIgnNUCxkTuFQXW6py2R8vgsArfpyHveCugw,1904
@@ -2567,9 +2567,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
+mteb-2.3.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.3.2.dist-info/METADATA,sha256=LEbGSbNtHSdIf03wLQKaayWlIbr0sGHRfUCvlO4Voe0,13797
+mteb-2.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.3.2.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.3.2.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.3.2.dist-info/RECORD,,
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/WHEEL: file without changes
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/entry_points.txt: file without changes
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/licenses/LICENSE: file without changes
{mteb-2.3.1.dist-info → mteb-2.3.2.dist-info}/top_level.txt: file without changes