mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +33 -27
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +7 -26
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +22 -19
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +27 -22
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +15 -10
- mteb/cli/generate_model_card.py +10 -7
- mteb/deprecated_evaluator.py +60 -46
- mteb/evaluate.py +39 -30
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +7 -5
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/gme_v_models.py +4 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +14 -13
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +26 -12
- mteb/models/sentence_transformer_wrapper.py +19 -14
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +28 -20
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +55 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/cache.py
CHANGED
@@ -3,8 +3,9 @@ import logging
 import os
 import shutil
 import subprocess
+import warnings
 from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from pathlib import Path
 from typing import cast

@@ -83,9 +84,9 @@ class ResultCache:
         model_path = results_folder / model_name

         if model_revision is None:
-
-
-            )
+            msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
+            logger.warning(msg)
+            warnings.warn(msg)
             # get revs from paths
             revisions = [p for p in model_path.glob("*") if p.is_dir()]
             if not revisions:
@@ -281,15 +282,17 @@ class ResultCache:
             shutil.rmtree(self.cache_path)
             logger.info(f"Cache directory {self.cache_path} cleared.")
         else:
-
+            msg = f"Cache directory `{self.cache_path}` does not exist."
+            logger.warning(msg)
+            warnings.warn(msg)

     def __repr__(self) -> str:
         return f"ResultCache(cache_path={self.cache_path})"

     def get_cache_paths(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
     ) -> list[Path]:
@@ -422,7 +425,7 @@ class ResultCache:
     @staticmethod
     def _filter_paths_by_model_and_revision(
         paths: list[Path],
-        models: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
     ) -> list[Path]:
         """Filter a list of paths by model name and optional revision.

@@ -432,8 +435,9 @@ class ResultCache:
         if not models:
             return paths

-
-
+        first_model = next(iter(models))
+        if isinstance(first_model, ModelMeta):
+            models = cast(Iterable[ModelMeta], models)
             name_and_revision = {
                 (m.model_name_as_path(), m.revision or "no_revision_available")
                 for m in models
@@ -444,13 +448,14 @@ class ResultCache:
                 if (p.parent.parent.name, p.parent.name) in name_and_revision
             ]

-
+        str_models = cast(Sequence[str], models)
+        model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
         return [p for p in paths if p.parent.parent.name in model_names]

     @staticmethod
     def _filter_paths_by_task(
         paths: list[Path],
-        tasks: Sequence[str] |
+        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
     ) -> list[Path]:
         if tasks is not None:
             task_names = set()
@@ -466,8 +471,8 @@ class ResultCache:

     def load_results(
         self,
-        models: Sequence[str] |
-        tasks: Sequence[str] |
+        models: Sequence[str] | Iterable[ModelMeta] | None = None,
+        tasks: Sequence[str] | Iterable[AbsTask] | str | None = None,
         require_model_meta: bool = True,
         include_remote: bool = True,
         validate_and_filter: bool = False,
@@ -511,7 +516,7 @@ class ResultCache:
         )
         models_results = defaultdict(list)

-        task_names = {}
+        task_names: dict[str, AbsTask | None] = {}
         if tasks is not None:
             for task in tasks:
                 if isinstance(task, AbsTask):
@@ -529,9 +534,11 @@ class ResultCache:
                 )

                 if validate_and_filter:
-
+                    task_instance = task_names[task_result.task_name]
                     try:
-                        task_result = task_result.validate_and_filter_scores(
+                        task_result = task_result.validate_and_filter_scores(
+                            task=task_instance
+                        )
                     except Exception as e:
                         logger.info(
                             f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -541,7 +548,7 @@ class ResultCache:
                 models_results[(model_name, revision)].append(task_result)

         # create BenchmarkResults object
-
+        models_results_object = [
             ModelResult(
                 model_name=model_name,
                 model_revision=revision,
@@ -550,9 +557,7 @@ class ResultCache:
             for (model_name, revision), task_results in models_results.items()
         ]

-
-        model_results=
+        return BenchmarkResults(
+            model_results=models_results_object,
             benchmark=tasks if isinstance(tasks, Benchmark) else None,
         )
-
-        return benchmark_results
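Illustrative usage sketch (not part of the diff): the widened `get_cache_paths`/`load_results` signatures above accept `ModelMeta` and `AbsTask` objects as well as plain strings. The model name and task name below are placeholders, and the `mteb.get_model_meta`/`mteb.get_tasks` helpers are assumed from the package's top-level API.

    import mteb
    from mteb.cache import ResultCache

    cache = ResultCache()
    meta = mteb.get_model_meta("sentence-transformers/all-MiniLM-L6-v2")  # placeholder model
    tasks = mteb.get_tasks(["STS12"])  # placeholder task selection

    # Strings and objects should both be usable, per the new type hints above.
    results = cache.load_results(models=[meta], tasks=tasks, validate_and_filter=True)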
mteb/cli/_display_tasks.py
CHANGED
@@ -1,4 +1,4 @@
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence

 from mteb.abstasks import AbsTask
 from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
         _display_tasks(benchmark.tasks, name=name)


-def _display_tasks(task_list:
+def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
     from rich.console import Console

     console = Console()
mteb/cli/build_cli.py
CHANGED
@@ -1,18 +1,19 @@
 import argparse
 import logging
 import os
+import warnings
 from pathlib import Path

 import torch
 from rich.logging import RichHandler

 import mteb
+from mteb.abstasks.abstask import AbsTask
 from mteb.cache import ResultCache
+from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy

-from ._display_tasks import _display_benchmarks, _display_tasks
-
 logger = logging.getLogger(__name__)


@@ -53,7 +54,7 @@ def run(args: argparse.Namespace) -> None:

     if args.benchmarks:
         benchmarks = mteb.get_benchmarks(names=args.benchmarks)
-        tasks =
+        tasks = tuple(t for b in benchmarks for t in b.tasks)
     else:
         tasks = mteb.get_tasks(
             categories=args.categories,
@@ -69,15 +70,17 @@ def run(args: argparse.Namespace) -> None:

     overwrite_strategy = args.overwrite_strategy
     if args.overwrite:
-
-            "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead."
+        warnings.warn(
+            "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead.",
+            DeprecationWarning,
         )
         overwrite_strategy = OverwriteStrategy.ALWAYS.value

     prediction_folder = args.prediction_folder
     if args.save_predictions:
-
-            "`--save_predictions` is deprecated, please use `--prediction-folder` instead."
+        warnings.warn(
+            "`--save_predictions` is deprecated, please use `--prediction-folder` instead.",
+            DeprecationWarning,
         )
         prediction_folder = args.output_folder

@@ -279,15 +282,17 @@ def _create_meta(args: argparse.Namespace) -> None:
         from_existing = Path(from_existing)

     if output_path.exists() and overwrite:
-
+        msg = "Output path already exists, overwriting."
+        logger.warning(msg)
+        warnings.warn(msg)
     elif output_path.exists():
         raise FileExistsError(
             "Output path already exists, use --overwrite to overwrite."
         )

-    tasks = []
+    tasks: list[AbsTask] = []
     if tasks_names is not None:
-        tasks = mteb.get_tasks(tasks_names)
+        tasks = list(mteb.get_tasks(tasks_names))
     if benchmarks is not None:
         benchmarks = mteb.get_benchmarks(benchmarks)
         for benchmark in benchmarks:
mteb/cli/generate_model_card.py
CHANGED
@@ -1,4 +1,6 @@
 import logging
+import warnings
+from collections.abc import Sequence
 from pathlib import Path

 from huggingface_hub import ModelCard, ModelCardData, repo_exists
@@ -12,7 +14,7 @@ logger = logging.getLogger(__name__)

 def generate_model_card(
     model_name: str,
-    tasks:
+    tasks: Sequence[AbsTask] | None = None,
     existing_model_card_id_or_path: str | Path | None = None,
     results_cache: ResultCache = ResultCache(),
     output_path: Path = Path("model_card.md"),
@@ -47,8 +49,8 @@ def generate_model_card(
     for task_result in models_results.task_results:
         eval_results.extend(task_result.get_hf_eval_results())

-    existing_model_card_data = (
-        existing_model_card.data if existing_model_card else ModelCardData()
+    existing_model_card_data: ModelCardData = (
+        existing_model_card.data if existing_model_card else ModelCardData()  # type: ignore[assignment]
     )

     if existing_model_card_data.eval_results is None:
@@ -88,13 +90,14 @@ def generate_model_card(
         benchmark_results, existing_model_card
     )

-    if push_to_hub:
+    if push_to_hub and existing_model_card_id_or_path:
+        existing_model_card_id_or_path = str(existing_model_card_id_or_path)
         if repo_exists(existing_model_card_id_or_path):
             existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
         else:
-
-
-            )
+            msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
+            logger.warning(msg)
+            warnings.warn(msg)
     existing_model_card.save(output_path)

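Hypothetical call sketch based on the updated `generate_model_card` signature above; the model name and task list are placeholders, and `mteb.get_tasks` is assumed to return `AbsTask` instances:

    from pathlib import Path

    import mteb
    from mteb.cli.generate_model_card import generate_model_card

    tasks = mteb.get_tasks(["STS12"])  # placeholder task selection
    generate_model_card(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
        tasks=tasks,
        output_path=Path("model_card.md"),
    )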
mteb/deprecated_evaluator.py
CHANGED
@@ -5,23 +5,24 @@ import logging
 import os
 import sys
 import traceback
-
+import warnings
+from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast

 import datasets

 import mteb
 from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate
 from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
-    EncoderProtocol,
     ModelMeta,
     MTEBModels,
     SentenceTransformerEncoderWrapper,
@@ -52,7 +53,7 @@ class MTEB:
     )
     def __init__(
         self,
-        tasks: Iterable[AbsTask | Benchmark],
+        tasks: Iterable[AbsTask] | Iterable[Benchmark],
         *,
         err_logs_path: str = "error_logs.txt",
     ) -> None:
@@ -63,15 +64,14 @@ class MTEB:
             `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
             err_logs_path: Path to save error logs.
         """
-
-
-        self.tasks = list(tasks)
-        if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
+        if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(
+            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+        elif isinstance(next(iter(tasks)), AbsTask):
+            self.tasks = list(cast(Iterable[AbsTask], tasks))

         self.err_logs_path = Path(err_logs_path)
-        self.
+        self._last_evaluated_splits: dict[str, list[str]] = {}

     @property
     def available_tasks(self) -> list[str]:
@@ -84,7 +84,7 @@ class MTEB:
         return sorted({x.metadata.type for x in self.tasks})

     @property
-    def available_task_categories(self) -> set[TaskCategory]:
+    def available_task_categories(self) -> set[TaskCategory | None]:
         """Set of available task categories."""
         return {x.metadata.category for x in self.tasks}

@@ -231,13 +231,14 @@ class MTEB:
         merged_kg_co2_emissions = None
         if existing_kg_co2_emissions and new_kg_co2_emissions:
             merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
+        existing_evaluation_time = existing_results.evaluation_time or 0
+        new_evaluation_time = new_results.evaluation_time or 0
         merged_results = TaskResult(
             dataset_revision=new_results.dataset_revision,
             task_name=new_results.task_name,
             mteb_version=new_results.mteb_version,
             scores=merged_scores,
-            evaluation_time=
-            + new_results.evaluation_time,
+            evaluation_time=existing_evaluation_time + new_evaluation_time,
             kg_co2_emissions=merged_kg_co2_emissions,
         )

@@ -306,13 +307,16 @@ class MTEB:
         elif verbosity == 3:
             datasets.logging.set_verbosity(logging.DEBUG)

-
-        output_path = self._create_output_folder(meta, output_folder)
-
+        mteb_model: MTEBModels
         if isinstance(model, SentenceTransformer):
-
+            mteb_model = SentenceTransformerEncoderWrapper(model)
         elif isinstance(model, CrossEncoder):
-
+            mteb_model = CrossEncoderWrapper(model)
+        else:
+            mteb_model = cast(MTEBModels, model)
+
+        meta = self.create_model_meta(mteb_model)
+        output_path = self._create_output_folder(meta, output_folder)

         # Disable co2_tracker for API models
         if "API" in meta.framework:
@@ -333,7 +337,7 @@ class MTEB:
         )  # save them in case we re-use the object (e.g. for reranking)

         # To evaluate missing splits, we keep track of the task name and the corresponding splits.
-        self.
+        self._last_evaluated_splits = {}

         while len(self.tasks) > 0:
             task = self.tasks[0]
@@ -342,9 +346,10 @@ class MTEB:
             )

             if task.is_aggregate:
-
-
-
+                aggregated_task = cast(AbsTaskAggregate, task)
+                self_ = MTEB(tasks=aggregated_task.metadata.tasks)
+                aggregated_task_results = self_.run(
+                    mteb_model,
                     verbosity=verbosity - 1,
                     output_folder=output_folder,
                     eval_splits=eval_splits,
@@ -355,12 +360,15 @@ class MTEB:
                     encode_kwargs=encode_kwargs,
                     **kwargs,
                 )
-                new_results =
+                new_results = aggregated_task.combine_task_results(
+                    aggregated_task_results
+                )
                 evaluation_results.append(new_results)

                 if output_path:
-
-
+                    new_results.to_disk(
+                        output_path / f"{aggregated_task.metadata.name}.json"
+                    )
                 del self.tasks[0]
                 continue

@@ -382,7 +390,7 @@ class MTEB:
             task_subsets = task.hf_subsets

             existing_results = None
-            save_path = None
+            save_path: Path | None = None
             final_splits_to_run = task_eval_splits
             missing_evaluations = self._get_missing_evaluations(
                 existing_results,
@@ -432,7 +440,7 @@ class MTEB:
                 logger.info(
                     f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
                 )
-                self.
+                self._last_evaluated_splits[task.metadata.name] = []
                 del self.tasks[0]
                 continue

@@ -440,11 +448,11 @@ class MTEB:
             task.check_if_dataset_is_superseded()
             task.load_data()

-            task_results = {}
+            task_results: dict[str, dict[str, dict[str, Any]]] = {}
             evaluation_time = 0
             kg_co2_emissions: int | None = 0 if co2_tracker else None

-            self.
+            self._last_evaluated_splits[task.metadata.name] = []

             for split in final_splits_to_run:
                 info = missing_evaluations[split]
@@ -465,14 +473,16 @@ class MTEB:

                 if co2_tracker:
                     try:
-                        from codecarbon import
+                        from codecarbon import (  # type: ignore[import-untyped]
+                            EmissionsTracker,
+                        )
                     except ImportError:
                         raise ImportError(
                             "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
                         )
-
-
-                    )
+                    msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
+                    logger.warning(msg)
+                    warnings.warn(msg)
                     with EmissionsTracker(
                         save_to_file=False,
                         save_to_api=False,
@@ -481,7 +491,7 @@ class MTEB:
                     ) as tracker:
                         results, tick, tock = self._run_eval(
                             task,
-
+                            mteb_model,
                             split,
                             encode_kwargs=encode_kwargs,
                             subsets_to_run=subsets_to_run,
@@ -494,7 +504,7 @@ class MTEB:
                 else:
                     results, tick, tock = self._run_eval(
                         task,
-
+                        mteb_model,
                         split,
                         subsets_to_run=subsets_to_run,
                         encode_kwargs=encode_kwargs,
@@ -510,25 +520,25 @@ class MTEB:
                 if verbosity >= 1:
                     logger.info(f"Scores: {task_results[split]}")

-                self.
+                self._last_evaluated_splits[task.metadata.name].append(split)

             # Create new TaskResult
             new_results = TaskResult.from_task_results(
                 task,
-                task_results,
+                task_results,  # type: ignore[arg-type]
                 evaluation_time=evaluation_time,
                 kg_co2_emissions=kg_co2_emissions,
             )

             # Merge with existing if needed
-            if output_path and save_path.exists():
+            if output_path and save_path and save_path.exists():
                 existing_results = TaskResult.from_disk(save_path)
             if existing_results:
                 merged_results = self._merge_results(existing_results, new_results)
             else:
                 merged_results = new_results

-            if output_path:
+            if output_path and save_path:
                 merged_results.to_disk(save_path)

             evaluation_results.append(merged_results)
@@ -555,7 +565,7 @@ class MTEB:
     def create_model_meta(model: MTEBModels) -> ModelMeta:
         """Create a ModelMeta object for the given model."""
         if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
-            meta = model.mteb_model_meta
+            meta = model.mteb_model_meta
         else:
             meta = MTEB._get_model_meta(model)

@@ -581,7 +591,7 @@ class MTEB:
         if output_folder is None:
             return None

-        model_revision: str =
+        model_revision: str = (
+            model_meta.revision
+            if model_meta.revision is not None
+            else "no_revision_available"
+        )
         model_path_name = model_meta.model_name_as_path()

         output_path = Path(output_folder) / model_path_name / model_revision
@@ -603,15 +617,15 @@ class MTEB:
         Tasks with empty lists indicate that results already existed and no splits were evaluated.
         """
         return deepcopy(
-            {task: list(splits) for task, splits in self.
+            {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
         )

     @staticmethod
     def _get_missing_evaluations(
         existing_results: TaskResult | None,
-        task_eval_splits:
-        task_eval_langs:
-        eval_subsets:
+        task_eval_splits: Sequence[str],
+        task_eval_langs: Sequence[str],
+        eval_subsets: Sequence[str] | None,
     ) -> dict[str, dict[str, Any]]:
         """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
         missing_evaluations = {
@@ -660,7 +674,7 @@ class MTEB:
         return missing_evaluations

     @staticmethod
-    def _get_model_meta(model:
+    def _get_model_meta(model: MTEBModels) -> ModelMeta:
         from sentence_transformers import CrossEncoder, SentenceTransformer

         if isinstance(model, CrossEncoder):