mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/benchmarks/get_benchmark.py
CHANGED
@@ -1,6 +1,5 @@
 import difflib
 import logging
-import warnings
 from functools import lru_cache
 
 from .benchmark import Benchmark
@@ -20,53 +19,16 @@ def _build_registry() -> dict[str, Benchmark]:
     return benchmark_registry
 
 
-
-
-
-
-
-
-
-
-
-
-        MTEB_INDIC,
-        MTEB_JPN,
-        MTEB_KOR,
-        MTEB_MAIN_RU,
-        MTEB_POL,
-        MTEB_RETRIEVAL_LAW,
-        MTEB_RETRIEVAL_MEDICAL,
-        MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
-        SEB,
-        VISUAL_DOCUMENT_RETRIEVAL,
-        MTEB_code,
-        MTEB_multilingual_v2,
-    )
-
-    previous_benchmark_names = {
-        "MTEB(eng)": MTEB_EN.name,
-        "MTEB(eng, classic)": MTEB_ENG_CLASSIC.name,
-        "MTEB(rus)": MTEB_MAIN_RU.name,
-        "MTEB(Retrieval w/Instructions)": MTEB_RETRIEVAL_WITH_INSTRUCTIONS.name,
-        "MTEB(law)": MTEB_RETRIEVAL_LAW.name,
-        "MTEB(Medical)": MTEB_RETRIEVAL_MEDICAL.name,
-        "MTEB(Scandinavian)": SEB.name,
-        "MTEB(fra)": MTEB_FRA.name,
-        "MTEB(deu)": MTEB_DEU.name,
-        "MTEB(kor)": MTEB_KOR.name,
-        "MTEB(pol)": MTEB_POL.name,
-        "MTEB(code)": MTEB_code.name,
-        "MTEB(Multilingual)": MTEB_multilingual_v2.name,
-        "MTEB(jpn)": MTEB_JPN.name,
-        "MTEB(Indic)": MTEB_INDIC.name,
-        "MTEB(Europe)": MTEB_EU.name,
-        "MTEB(Chinese)": C_MTEB.name,
-        "FaMTEB(fas, beta)": FA_MTEB.name,
-        "BRIGHT(long)": BRIGHT_LONG.name,
-        "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
-    }
-    return previous_benchmark_names
+@lru_cache
+def _build_aliases_registry() -> dict[str, Benchmark]:
+    import mteb.benchmarks.benchmarks as benchmark_module
+
+    aliases: dict[str, Benchmark] = {}
+    for _, inst in benchmark_module.__dict__.items():
+        if isinstance(inst, Benchmark) and inst.aliases is not None:
+            for alias in inst.aliases:
+                aliases[alias] = inst
+    return aliases
 
 
 def get_benchmark(
@@ -80,14 +42,11 @@ def get_benchmark(
     Returns:
         The Benchmark instance corresponding to the given name.
     """
-    previous_benchmark_names = _get_previous_benchmark_names()
     benchmark_registry = _build_registry()
-
-
-
-
-        )
-        benchmark_name = previous_benchmark_names[benchmark_name]
+    aliases_registry = _build_aliases_registry()
+
+    if benchmark_name in aliases_registry:
+        return aliases_registry[benchmark_name]
     if benchmark_name not in benchmark_registry:
         close_matches = difflib.get_close_matches(
             benchmark_name, benchmark_registry.keys()
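Note: the change above removes the hard-coded previous_benchmark_names mapping and resolves legacy names through Benchmark.aliases via the cached _build_aliases_registry(). A minimal usage sketch, assuming the legacy name below is still registered as an alias in 2.5.5:

    import mteb

    # Legacy benchmark names now resolve through the alias registry built from
    # Benchmark.aliases, rather than a hand-maintained dict in get_benchmark.py.
    benchmark = mteb.get_benchmark("MTEB(eng, classic)")  # assumed alias
    print(benchmark.name)     # canonical name of the resolved Benchmark
    print(benchmark.aliases)  # aliases declared on the Benchmark instance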
mteb/cache.py
CHANGED
@@ -5,7 +5,7 @@ import shutil
 import subprocess
 import warnings
 from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from pathlib import Path
 from typing import cast
 
@@ -291,8 +291,8 @@ class ResultCache:
 
     def get_cache_paths(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
     ) -> list[Path]:
@@ -425,7 +425,7 @@ class ResultCache:
     @staticmethod
     def _filter_paths_by_model_and_revision(
         paths: list[Path],
-        models: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
     ) -> list[Path]:
         """Filter a list of paths by model name and optional revision.
 
@@ -435,8 +435,9 @@ class ResultCache:
         if not models:
             return paths
 
-
-
+        first_model = next(iter(models))
+        if isinstance(first_model, ModelMeta):
+            models = cast(Iterable[ModelMeta], models)
             name_and_revision = {
                 (m.model_name_as_path(), m.revision or "no_revision_available")
                 for m in models
@@ -447,13 +448,14 @@ class ResultCache:
                 if (p.parent.parent.name, p.parent.name) in name_and_revision
             ]
 
-
+        str_models = cast(Sequence[str], models)
+        model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
         return [p for p in paths if p.parent.parent.name in model_names]
 
     @staticmethod
     def _filter_paths_by_task(
         paths: list[Path],
-        tasks: Sequence[str] |
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
     ) -> list[Path]:
         if tasks is not None:
             task_names = set()
@@ -469,8 +471,8 @@ class ResultCache:
 
     def load_results(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
         validate_and_filter: bool = False,
@@ -481,6 +483,7 @@ class ResultCache:
         Args:
             models: A list of model names to load the results for. If None it will load the results for all models.
             tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+                If Benchmark is passed, then all tasks in the benchmark will be loaded.
                 If None it will load the results for all tasks.
             require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                 extract the model name and revision from the path.
@@ -514,7 +517,7 @@ class ResultCache:
         )
         models_results = defaultdict(list)
 
-        task_names = {}
+        task_names: dict[str, AbsTask | None] = {}
         if tasks is not None:
             for task in tasks:
                 if isinstance(task, AbsTask):
@@ -532,9 +535,11 @@ class ResultCache:
                     )
 
                 if validate_and_filter:
-
+                    task_instance = task_names[task_result.task_name]
                     try:
-                        task_result = task_result.validate_and_filter_scores(
+                        task_result = task_result.validate_and_filter_scores(
+                            task=task_instance
+                        )
                     except Exception as e:
                         logger.info(
                             f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -544,7 +549,7 @@ class ResultCache:
                 models_results[(model_name, revision)].append(task_result)
 
         # create BenchmarkResults object
-
+        models_results_object = [
             ModelResult(
                 model_name=model_name,
                 model_revision=revision,
@@ -553,9 +558,7 @@ class ResultCache:
             for (model_name, revision), task_results in models_results.items()
         ]
 
-
-            model_results=
+        return BenchmarkResults(
+            model_results=models_results_object,
             benchmark=tasks if isinstance(tasks, Benchmark) else None,
         )
-
-        return benchmark_results
mteb/cli/_display_tasks.py
CHANGED
@@ -1,4 +1,4 @@
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 
 from mteb.abstasks import AbsTask
 from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
         _display_tasks(benchmark.tasks, name=name)
 
 
-def _display_tasks(task_list:
+def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
     from rich.console import Console
 
     console = Console()
mteb/cli/build_cli.py
CHANGED
@@ -8,12 +8,12 @@ import torch
 from rich.logging import RichHandler
 
 import mteb
+from mteb.abstasks.abstask import AbsTask
 from mteb.cache import ResultCache
+from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy
 
-from ._display_tasks import _display_benchmarks, _display_tasks
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +54,7 @@ def run(args: argparse.Namespace) -> None:
 
     if args.benchmarks:
         benchmarks = mteb.get_benchmarks(names=args.benchmarks)
-        tasks =
+        tasks = tuple(t for b in benchmarks for t in b.tasks)
     else:
         tasks = mteb.get_tasks(
             categories=args.categories,
@@ -290,17 +290,17 @@ def _create_meta(args: argparse.Namespace) -> None:
             "Output path already exists, use --overwrite to overwrite."
         )
 
-
+    benchmarks = None
+    tasks: list[AbsTask] = []
     if tasks_names is not None:
-        tasks = mteb.get_tasks(tasks_names)
+        tasks = list(mteb.get_tasks(tasks_names))
     if benchmarks is not None:
         benchmarks = mteb.get_benchmarks(benchmarks)
-        for benchmark in benchmarks:
-            tasks.extend(benchmark.tasks)
 
     generate_model_card(
         model_name,
-        tasks
+        tasks,
+        benchmarks,
         existing_model_card_id_or_path=from_existing,
         results_cache=ResultCache(results_folder),
         output_path=output_path,
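Note: the run command now flattens benchmark objects into their task lists itself. The equivalent library-level calls, mirroring the hunk above (the benchmark name is illustrative):

    import mteb

    benchmarks = mteb.get_benchmarks(names=["MTEB(Multilingual, v2)"])  # illustrative name
    tasks = tuple(t for b in benchmarks for t in b.tasks)  # same flattening as the CLI
    print(len(tasks))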
mteb/cli/generate_model_card.py
CHANGED
@@ -1,11 +1,12 @@
 import logging
 import warnings
+from collections.abc import Sequence
 from pathlib import Path
 
 from huggingface_hub import ModelCard, ModelCardData, repo_exists
 
-from mteb import BenchmarkResults
 from mteb.abstasks.abstask import AbsTask
+from mteb.benchmarks.benchmark import Benchmark
 from mteb.cache import ResultCache
 
 logger = logging.getLogger(__name__)
@@ -13,12 +14,13 @@ logger = logging.getLogger(__name__)
 
 def generate_model_card(
     model_name: str,
-    tasks:
+    tasks: Sequence[AbsTask] | None = None,
+    benchmarks: Sequence[Benchmark] | None = None,
     existing_model_card_id_or_path: str | Path | None = None,
     results_cache: ResultCache = ResultCache(),
     output_path: Path = Path("model_card.md"),
     add_table_to_model_card: bool = False,
-    models_to_compare:
+    models_to_compare: Sequence[str] | None = None,
     token: str | None = None,
     push_to_hub: bool = False,
 ) -> None:
@@ -27,6 +29,7 @@ def generate_model_card(
     Args:
         model_name: Name of the model.
        tasks: List of tasks to generate results for.
+        benchmarks: A Benchmark or list of benchmarks to generate results for.
        existing_model_card_id_or_path: Path or ID of an existing model card to update.
        results_cache: Instance of ResultCache to load results from.
        output_path: Path to save the generated model card.
@@ -40,16 +43,24 @@ def generate_model_card(
     if existing_model_card_id_or_path:
         existing_model_card = ModelCard.load(existing_model_card_id_or_path)
 
+    all_tasks: list[AbsTask] = []
+    if tasks is not None:
+        all_tasks.extend(tasks)
+
+    if benchmarks is not None:
+        for b in benchmarks:
+            all_tasks.extend(b.tasks)
+
     benchmark_results = results_cache.load_results(
-        [model_name],
+        [model_name], all_tasks if all_tasks else None, only_main_score=True
     )
     eval_results = []
     for models_results in benchmark_results.model_results:
         for task_result in models_results.task_results:
             eval_results.extend(task_result.get_hf_eval_results())
 
-    existing_model_card_data = (
-        existing_model_card.data if existing_model_card else ModelCardData()
+    existing_model_card_data: ModelCardData = (
+        existing_model_card.data if existing_model_card else ModelCardData()  # type: ignore[assignment]
     )
 
     if existing_model_card_data.eval_results is None:
@@ -79,17 +90,16 @@ def generate_model_card(
         card_data=existing_model_card_data
     )
 
-    if models_to_compare:
-        benchmark_results = results_cache.load_results(
-            [model_name, *models_to_compare], tasks, only_main_score=True
-        )
-
     if add_table_to_model_card:
         existing_model_card = _add_table_to_model_card(
-
+            results_cache,
+            existing_model_card,
+            (model_name, *models_to_compare) if models_to_compare else (model_name,),
+            benchmarks or [],
         )
 
-    if push_to_hub:
+    if push_to_hub and existing_model_card_id_or_path:
+        existing_model_card_id_or_path = str(existing_model_card_id_or_path)
         if repo_exists(existing_model_card_id_or_path):
             existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
         else:
@@ -100,14 +110,23 @@ def generate_model_card(
 
 
 def _add_table_to_model_card(
-
+    results_cache: ResultCache,
+    model_card: ModelCard,
+    models: Sequence[str],
+    benchmarks: Sequence[Benchmark],
 ) -> ModelCard:
     original_content = model_card.content
-
-
-
-
-
-
+    mteb_content = "# MTEB Results\n\n"
+
+    for benchmark in benchmarks:
+        mteb_content += f"## Benchmark: {benchmark.name}\n\n"
+        benchmark_results = results_cache.load_results(
+            tasks=benchmark,
+            models=models,
+            only_main_score=True,
+        )
+        df_results = benchmark_results.get_benchmark_result()
+        mteb_content += df_results.to_markdown(index=True) + "\n\n"
+
     model_card.content = original_content + "\n\n" + mteb_content
     return model_card
mteb/deprecated_evaluator.py
CHANGED
@@ -6,23 +6,23 @@ import os
 import sys
 import traceback
 import warnings
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import datasets
 
 import mteb
 from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
-    EncoderProtocol,
     ModelMeta,
     MTEBModels,
     SentenceTransformerEncoderWrapper,
@@ -53,7 +53,7 @@ class MTEB:
     )
     def __init__(
         self,
-        tasks: Iterable[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask] | Iterable[Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
     ) -> None:
@@ -64,15 +64,14 @@ class MTEB:
                 `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
             err_logs_path: Path to save error logs.
         """
-
-
-        self.tasks = list(tasks)
-        if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+        if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(
+            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+        elif isinstance(next(iter(tasks)), AbsTask):
+            self.tasks = list(cast(Iterable[AbsTask], tasks))
 
         self.err_logs_path = Path(err_logs_path)
-        self.
+        self._last_evaluated_splits: dict[str, list[str]] = {}
 
     @property
     def available_tasks(self) -> list[str]:
@@ -85,7 +84,7 @@ class MTEB:
         return sorted({x.metadata.type for x in self.tasks})
 
     @property
-    def available_task_categories(self) -> set[TaskCategory]:
+    def available_task_categories(self) -> set[TaskCategory | None]:
         """Set of available task categories."""
         return {x.metadata.category for x in self.tasks}
 
@@ -232,13 +231,14 @@ class MTEB:
         merged_kg_co2_emissions = None
         if existing_kg_co2_emissions and new_kg_co2_emissions:
             merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+        existing_evaluation_time = existing_results.evaluation_time or 0
+        new_evaluation_time = new_results.evaluation_time or 0
         merged_results = TaskResult(
             dataset_revision=new_results.dataset_revision,
             task_name=new_results.task_name,
             mteb_version=new_results.mteb_version,
             scores=merged_scores,
-            evaluation_time=
-            + new_results.evaluation_time,
+            evaluation_time=existing_evaluation_time + new_evaluation_time,
             kg_co2_emissions=merged_kg_co2_emissions,
         )
 
@@ -307,13 +307,16 @@ class MTEB:
         elif verbosity == 3:
             datasets.logging.set_verbosity(logging.DEBUG)
 
-
-        output_path = self._create_output_folder(meta, output_folder)
-
+        mteb_model: MTEBModels
         if isinstance(model, SentenceTransformer):
-
+            mteb_model = SentenceTransformerEncoderWrapper(model)
         elif isinstance(model, CrossEncoder):
-
+            mteb_model = CrossEncoderWrapper(model)
+        else:
+            mteb_model = cast(MTEBModels, model)
+
+        meta = self.create_model_meta(mteb_model)
+        output_path = self._create_output_folder(meta, output_folder)
 
         # Disable co2_tracker for API models
         if "API" in meta.framework:
@@ -334,7 +337,7 @@ class MTEB:
         ) # save them in case we re-use the object (e.g. for reranking)
 
         # To evaluate missing splits, we keep track of the task name and the corresponding splits.
-        self.
+        self._last_evaluated_splits = {}
 
         while len(self.tasks) > 0:
             task = self.tasks[0]
@@ -343,9 +346,10 @@ class MTEB:
             )
 
             if task.is_aggregate:
-
-
-
+                aggregated_task = cast(AbsTaskAggregate, task)
+                self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+                aggregated_task_results = self_.run(
+                    mteb_model,
                     verbosity=verbosity - 1,
                     output_folder=output_folder,
                     eval_splits=eval_splits,
@@ -356,12 +360,15 @@ class MTEB:
                     encode_kwargs=encode_kwargs,
                     **kwargs,
                 )
-                new_results =
+                new_results = aggregated_task.combine_task_results(
+                    aggregated_task_results
+                )
                 evaluation_results.append(new_results)
 
                 if output_path:
-
-
+                    new_results.to_disk(
+                        output_path / f"{aggregated_task.metadata.name}.json"
+                    )
                 del self.tasks[0]
                 continue
 
@@ -383,7 +390,7 @@ class MTEB:
             task_subsets = task.hf_subsets
 
             existing_results = None
-            save_path = None
+            save_path: Path | None = None
             final_splits_to_run = task_eval_splits
             missing_evaluations = self._get_missing_evaluations(
                 existing_results,
@@ -433,7 +440,7 @@ class MTEB:
                 logger.info(
                     f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
                 )
-                self.
+                self._last_evaluated_splits[task.metadata.name] = []
                 del self.tasks[0]
                 continue
 
@@ -441,11 +448,11 @@ class MTEB:
             task.check_if_dataset_is_superseded()
             task.load_data()
 
-            task_results = {}
+            task_results: dict[str, dict[str, dict[str, Any]]] = {}
             evaluation_time = 0
             kg_co2_emissions: int | None = 0 if co2_tracker else None
 
-            self.
+            self._last_evaluated_splits[task.metadata.name] = []
 
             for split in final_splits_to_run:
                 info = missing_evaluations[split]
@@ -466,7 +473,9 @@ class MTEB:
 
                 if co2_tracker:
                     try:
-                        from codecarbon import
+                        from codecarbon import (  # type: ignore[import-untyped]
+                            EmissionsTracker,
+                        )
                     except ImportError:
                         raise ImportError(
                             "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
@@ -482,7 +491,7 @@ class MTEB:
                     ) as tracker:
                         results, tick, tock = self._run_eval(
                             task,
-
+                            mteb_model,
                             split,
                             encode_kwargs=encode_kwargs,
                             subsets_to_run=subsets_to_run,
@@ -495,7 +504,7 @@ class MTEB:
                 else:
                     results, tick, tock = self._run_eval(
                         task,
-
+                        mteb_model,
                         split,
                         subsets_to_run=subsets_to_run,
                         encode_kwargs=encode_kwargs,
@@ -511,25 +520,25 @@ class MTEB:
                 if verbosity >= 1:
                     logger.info(f"Scores: {task_results[split]}")
 
-                self.
+                self._last_evaluated_splits[task.metadata.name].append(split)
 
             # Create new TaskResult
             new_results = TaskResult.from_task_results(
                 task,
-                task_results,
+                task_results,  # type: ignore[arg-type]
                 evaluation_time=evaluation_time,
                 kg_co2_emissions=kg_co2_emissions,
             )
 
             # Merge with existing if needed
-            if output_path and save_path.exists():
+            if output_path and save_path and save_path.exists():
                 existing_results = TaskResult.from_disk(save_path)
                 if existing_results:
                     merged_results = self._merge_results(existing_results, new_results)
             else:
                 merged_results = new_results
 
-            if output_path:
+            if output_path and save_path:
                 merged_results.to_disk(save_path)
 
             evaluation_results.append(merged_results)
@@ -556,7 +565,7 @@ class MTEB:
     def create_model_meta(model: MTEBModels) -> ModelMeta:
         """Create a ModelMeta object for the given model."""
         if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
-            meta = model.mteb_model_meta
+            meta = model.mteb_model_meta
         else:
             meta = MTEB._get_model_meta(model)
 
@@ -582,7 +591,11 @@ class MTEB:
         if output_folder is None:
             return None
 
-        model_revision: str =
+        model_revision: str = (
+            model_meta.revision
+            if model_meta.revision is not None
+            else "no_revision_available"
+        )
         model_path_name = model_meta.model_name_as_path()
 
         output_path = Path(output_folder) / model_path_name / model_revision
@@ -604,15 +617,15 @@ class MTEB:
             Tasks with empty lists indicate that results already existed and no splits were evaluated.
         """
         return deepcopy(
-            {task: list(splits) for task, splits in self.
+            {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
        )
 
     @staticmethod
     def _get_missing_evaluations(
         existing_results: TaskResult | None,
-        task_eval_splits:
-        task_eval_langs:
-        eval_subsets:
+        task_eval_splits: Sequence[str],
+        task_eval_langs: Sequence[str],
+        eval_subsets: Sequence[str] | None,
     ) -> dict[str, dict[str, Any]]:
         """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
         missing_evaluations = {
@@ -661,7 +674,7 @@ class MTEB:
         return missing_evaluations
 
     @staticmethod
-    def _get_model_meta(model:
+    def _get_model_meta(model: MTEBModels) -> ModelMeta:
         from sentence_transformers import CrossEncoder, SentenceTransformer
 
         if isinstance(model, CrossEncoder):