mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/abstasks/regression.py
CHANGED
@@ -87,7 +87,7 @@ class AbsTaskRegression(AbsTaskClassification):
     Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
     """

-    evaluator: type[
+    evaluator: type[SklearnEvaluator] = SklearnEvaluator
     evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)

     train_split: str = "train"
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
         )["train"]
         return train_split_sampled, []

-    def _calculate_scores(
+    def _calculate_scores(  # type: ignore[override]
         self,
         y_test: np.ndarray | list[int],
         y_pred: np.ndarray,
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):

         return dataset_dict

-    def _calculate_descriptive_statistics_from_split(
+    def _calculate_descriptive_statistics_from_split(  # type: ignore[override]
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> RegressionDescriptiveStatistics:
         train_text = []
mteb/abstasks/retrieval.py
CHANGED
@@ -1,7 +1,7 @@
 import json
 import logging
 from collections import defaultdict
-from collections.abc import Callable, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from pathlib import Path
 from time import time
 from typing import Any, Literal
@@ -286,7 +286,7 @@ class AbsTaskRetrieval(AbsTask):
         encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
         **kwargs,
-    ) ->
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluate the model on the retrieval task.

         Args:
@@ -357,6 +357,8 @@ class AbsTaskRetrieval(AbsTask):
             **kwargs,
         )

+        search_model: SearchProtocol
+
         if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
             search_model = SearchEncoderWrapper(model)
         elif isinstance(model, CrossEncoderProtocol):
@@ -578,11 +580,12 @@ class AbsTaskRetrieval(AbsTask):
             if isinstance(data[split][subset_item], Dataset):
                 sections[split] = data[split][subset_item]
             elif converter is not None:
+                subset_data = data[split][subset_item]
+                if subset_data is None:
+                    continue
+
                 sections[split] = Dataset.from_list(
-                    [
-                        converter(idx, item)
-                        for idx, item in data[split][subset_item].items()
-                    ]
+                    [converter(idx, item) for idx, item in subset_data.items()]
                 )
             else:
                 raise ValueError(
@@ -680,7 +683,7 @@ class AbsTaskRetrieval(AbsTask):

         top_k_sorted = defaultdict(list)
         for query_id, values in top_ranked.items():
-            sorted_keys = sorted(values, key=values
+            sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
             top_k_sorted[query_id] = sorted_keys[: self._top_k]

         self.dataset[subset][split]["top_ranked"] = top_k_sorted
@@ -688,10 +691,10 @@ class AbsTaskRetrieval(AbsTask):


 def _process_relevant_docs(
-    collection:
+    collection: Mapping[str, Mapping[str, int]],
     hf_subset: str,
     split: str,
-) -> dict[str, dict[str,
+) -> dict[str, dict[str, int]]:
     """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this

     Returns:
mteb/abstasks/sts.py
CHANGED
@@ -7,7 +7,7 @@ from scipy.stats import pearsonr, spearmanr

 from mteb._evaluators import AnySTSEvaluator
 from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types import PromptType
 from mteb.types.statistics import (
     ImageStatistics,
@@ -103,7 +103,7 @@ class AbsTaskSTS(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         encode_kwargs: dict[str, Any],
         hf_split: str,
@@ -111,6 +111,9 @@ class AbsTaskSTS(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> STSMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = list(map(self._normalize, data_split["score"]))
         data_split = data_split.select_columns(list(self.column_names))

@@ -142,7 +145,7 @@
     ) -> STSMetrics:
         def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
             """Return (pearson, spearman) correlations between x and y."""
-            return pearsonr(x, y)[0], spearmanr(x, y)[0]
+            return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])

         cosine_pearson, cosine_spearman = compute_corr(
             normalized_scores, scores["cosine_scores"]
mteb/abstasks/task_metadata.py
CHANGED
@@ -2,9 +2,10 @@ import json
 import logging
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 from huggingface_hub import (
+    CardData,
     DatasetCard,
     DatasetCardData,
     constants,
@@ -150,7 +151,7 @@ _TASK_TYPE = (
     "InstructionReranking",
 ) + MIEB_TASK_TYPE

-TaskType = Literal[_TASK_TYPE]
+TaskType = Literal[_TASK_TYPE]  # type: ignore[valid-type]
 """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""


@@ -192,8 +193,10 @@ AnnotatorType = Literal[
 """The type of the annotators. Is often important for understanding the quality of a dataset."""


-PromptDict = TypedDict(
-    "PromptDict",
+PromptDict = TypedDict(  # type: ignore[misc]
+    "PromptDict",
+    {prompt_type.value: str for prompt_type in PromptType},
+    total=False,
 )
 """A dictionary containing the prompt used for the task.

@@ -365,7 +368,7 @@ class TaskMetadata(BaseModel):
         """Return a dictionary mapping huggingface subsets to languages."""
         if isinstance(self.eval_langs, dict):
             return self.eval_langs
-        return {"default": self.eval_langs}
+        return {"default": cast(list[str], self.eval_langs)}

     @property
     def intext_citation(self, include_cite: bool = True) -> str:
@@ -413,7 +416,7 @@
         for subset, subset_value in stats.items():
             if subset == "hf_subset_descriptive_stats":
                 continue
-            n_samples[subset] = subset_value["num_samples"]
+            n_samples[subset] = subset_value["num_samples"]
         return n_samples

     @property
@@ -446,7 +449,7 @@
         Raises:
             ValueError: If the prompt type is not recognized.
         """
-        if prompt_type is None:
+        if prompt_type is None or self.category is None:
             return self.modalities
         query_modalities, doc_modalities = self.category.split("2")
         category_to_modality: dict[str, Modalities] = {
@@ -466,7 +469,7 @@

     def _create_dataset_card_data(
         self,
-        existing_dataset_card_data:
+        existing_dataset_card_data: CardData | None = None,
     ) -> tuple[DatasetCardData, dict[str, Any]]:
         """Create a DatasetCardData object from the task metadata.

@@ -501,12 +504,13 @@

         tags = ["mteb"] + self.modalities

-        descriptive_stats =
-        if descriptive_stats is not None:
-
+        descriptive_stats = ""
+        if self.descriptive_stats is not None:
+            descriptive_stats_ = self.descriptive_stats
+            for split, split_stat in descriptive_stats_.items():
                 if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
                     split_stat.pop("hf_subset_descriptive_stats", {})
-            descriptive_stats = json.dumps(
+            descriptive_stats = json.dumps(descriptive_stats_, indent=4)

         dataset_card_data_params = existing_dataset_card_data.to_dict()
         # override the existing values
@@ -694,11 +698,11 @@

     def _hf_languages(self) -> list[str]:
         languages: list[str] = []
-        if self.is_multilingual:
-            for val in
+        if self.is_multilingual and isinstance(self.eval_langs, dict):
+            for val in self.eval_langs.values():
                 languages.extend(val)
         else:
-            languages = self.eval_langs
+            languages = cast(list[str], self.eval_langs)
         # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
         # or a special value like "code", "multilingual".
         readme_langs = []
@@ -710,7 +714,7 @@
             readme_langs.append(lang_name)
         return sorted(set(readme_langs))

-    def _hf_license(self) -> str:
+    def _hf_license(self) -> str | None:
         dataset_license = self.license
         if dataset_license:
             license_mapping = {
mteb/abstasks/text/bitext_mining.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Any, ClassVar, TypedDict
+from typing import Any, ClassVar, TypedDict, cast

 from datasets import Dataset, DatasetDict
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -78,6 +78,9 @@ class AbsTaskBitextMining(AbsTask):
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
         """Added load for "parallel" datasets"""
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         if not self.data_loaded:
             self.load_data()

@@ -87,11 +90,16 @@ class AbsTaskBitextMining(AbsTask):
         if subsets_to_run is not None:
             hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

-
+        encoder_model = cast(EncoderProtocol, model)
+
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
+        scores: dict[str, BitextMiningMetrics] = {}
         if self.parallel_subsets:
-            scores = self._evaluate_subset(
-
-                self.dataset[split],
+            scores = self._evaluate_subset(  # type: ignore[assignment]
+                encoder_model,
+                self.dataset[split],
                 parallel=True,
                 hf_split=split,
                 hf_subset="parallel",
@@ -109,8 +117,8 @@ class AbsTaskBitextMining(AbsTask):
                     data_split = self.dataset[split]
                 else:
                     data_split = self.dataset[hf_subset][split]
-                scores[hf_subset] = self._evaluate_subset(
-
+                scores[hf_subset] = self._evaluate_subset(  # type: ignore[assignment]
+                    encoder_model,
                     data_split,
                     hf_split=split,
                     hf_subset=hf_subset,
@@ -119,32 +127,32 @@ class AbsTaskBitextMining(AbsTask):
                     **kwargs,
                 )

-        return scores
+        return cast(dict[HFSubset, ScoresDict], scores)

     def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
         pairs = self._DEFAULT_PAIR
         if parallel:
-            pairs = [langpair.split("-") for langpair in self.hf_subsets]
+            pairs = [langpair.split("-") for langpair in self.hf_subsets]  # type: ignore[misc]
         return pairs

-    def _evaluate_subset(
+    def _evaluate_subset(  # type: ignore[override]
         self,
         model: EncoderProtocol,
         data_split: Dataset,
         *,
         hf_split: str,
         hf_subset: str,
-        parallel: bool = False,
         encode_kwargs: dict[str, Any],
         prediction_folder: Path | None = None,
+        parallel: bool = False,
         **kwargs,
-    ) ->
+    ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
         pairs = self._get_pairs(parallel)

         evaluator = BitextMiningEvaluator(
             data_split,
             task_metadata=self.metadata,
-            pair_columns=pairs,
+            pair_columns=pairs,
             hf_split=hf_split,
             hf_subset=hf_subset,
             **kwargs,
@@ -168,16 +176,16 @@ class AbsTaskBitextMining(AbsTask):
         )

         if parallel:
-
+            parallel_metrics = {}
             for keys, nearest_neighbors in neighbours.items():
-
+                parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)

-            for v in
+            for v in parallel_metrics.values():
                 self._add_main_score(v)
-
-
-
-
+            return parallel_metrics
+
+        def_pair_str = "-".join(self._DEFAULT_PAIR[0])
+        metrics = self._compute_metrics(neighbours[def_pair_str], gold)
+        self._add_main_score(metrics)
         return metrics

     def _compute_metrics(
@@ -250,8 +258,11 @@ class AbsTaskBitextMining(AbsTask):
         )

     def _push_dataset_to_hub(self, repo_name: str) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset is not loaded.")
+
         if self.metadata.is_multilingual:
-            dataset = defaultdict(dict)
+            dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")

@@ -266,10 +277,10 @@ class AbsTaskBitextMining(AbsTask):
                 for split in self.dataset[config]:
                     dataset[split][lang_1] = self.dataset[config][split][sent_1]
                     dataset[split][lang_2] = self.dataset[config][split][sent_2]
-
-
-
-
+            dataset_dict = DatasetDict(
+                {split: Dataset.from_dict(dataset[split]) for split in dataset}
+            )
+            dataset_dict.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
mteb/abstasks/text/reranking.py
CHANGED
@@ -16,7 +16,7 @@ else:

 logger = logging.getLogger(__name__)

-OLD_FORMAT_RERANKING_TASKS = []
+OLD_FORMAT_RERANKING_TASKS: list[str] = []


 @deprecated(
@@ -105,7 +105,9 @@ class AbsTaskReranking(AbsTaskRetrieval):
         )

         given_dataset = copy(given_dataset)
-        self.dataset = defaultdict(
+        self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
+            lambda: defaultdict(dict)  # type: ignore[arg-type]
+        )

         hf_subsets = self.hf_subsets

@@ -115,19 +117,19 @@
             if hf_subset in cur_dataset:
                 cur_dataset = cur_dataset[hf_subset]
             elif "name" in self.metadata.dataset:
-                cur_dataset = datasets.load_dataset(**self.metadata.dataset)
+                cur_dataset = datasets.load_dataset(**self.metadata.dataset)
                 assert hf_subset == "default", (
                     f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
                 )
             else:
                 cur_dataset = datasets.load_dataset(
                     **self.metadata.dataset, name=hf_subset
-                )
+                )

             for split in cur_dataset:
                 corpus = []
                 queries = []
-                relevant_docs = defaultdict(dict)
+                relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
                 top_ranked = defaultdict(list)

                 # Create an enumerated dataset to pass indices
mteb/abstasks/text/summarization.py
CHANGED
@@ -12,7 +12,7 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
     ScoreStatistics,
     SplitDescriptiveStatistics,
@@ -77,7 +77,7 @@ class AbsTaskSummarization(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
@@ -86,8 +86,13 @@ class AbsTaskSummarization(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> SummarizationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = [
-            (
+            (
+                (np.array(x) - self.min_score) / (self.max_score - self.min_score)
+            ).tolist()
             for x in data_split[self.relevancy_column_name]
         ]
         evaluator = self.evaluator(
mteb/abstasks/zeroshot_classification.py
CHANGED
@@ -7,7 +7,7 @@ from datasets import Dataset
 from sklearn import metrics

 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol
+from mteb.models import EncoderProtocol, MTEBModels
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -111,7 +111,7 @@ class AbsTaskZeroShotClassification(AbsTask):

     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
         hf_split: str,
@@ -120,6 +120,9 @@ class AbsTaskZeroShotClassification(AbsTask):
         prediction_folder: Path | None = None,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         candidate_labels = self.get_candidate_labels()
         data_split = data_split.select_columns(
             [self.input_column_name, self.label_column_name]
mteb/benchmarks/benchmark.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from collections.abc import
+from collections.abc import Iterator, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal

@@ -19,6 +19,7 @@ class Benchmark:

     Args:
         name: The name of the benchmark
+        aliases: Alternative names for the benchmark
         tasks: The tasks within the benchmark.
         description: A description of the benchmark, should include its intended goal and potentially a description of its construction
         reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
@@ -38,6 +39,7 @@ class Benchmark:

     name: str
     tasks: Sequence[AbsTask]
+    aliases: Sequence[str] = field(default_factory=tuple)
     description: str | None = None
     reference: StrURL | None = None
     citation: str | None = None
@@ -47,7 +49,7 @@ class Benchmark:
     display_name: str | None = None
     language_view: list[str] | Literal["all"] = field(default_factory=list)

-    def __iter__(self) ->
+    def __iter__(self) -> Iterator[AbsTask]:
         return iter(self.tasks)

     def __len__(self) -> int:
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED
@@ -18,6 +18,7 @@ MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,

 MTEB_EN = Benchmark(
     name="MTEB(eng, v2)",
+    aliases=["MTEB(eng)"],
     display_name="English",
     icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
     tasks=MTEBTasks(
@@ -89,6 +90,7 @@ The original MTEB leaderboard is available under the [MTEB(eng, v1)](http://mteb

 MTEB_ENG_CLASSIC = Benchmark(
     name="MTEB(eng, v1)",
+    aliases=["MTEB(eng, classic)", "MTEB"],
     display_name="English Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg",
     tasks=MTEBTasks(
@@ -185,6 +187,7 @@ We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benc

 MTEB_MAIN_RU = Benchmark(
     name="MTEB(rus, v1)",
+    aliases=["MTEB(rus)"],
     display_name="Russian legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
     tasks=MTEBTasks(
@@ -344,6 +347,7 @@ RU_SCI_BENCH = Benchmark(

 MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark(
     name="FollowIR",
+    aliases=["MTEB(Retrieval w/Instructions)"],
     display_name="Instruction Following",
     tasks=get_tasks(
         tasks=[
@@ -394,7 +398,9 @@ MTEB_RETRIEVAL_WITH_DOMAIN_INSTRUCTIONS = Benchmark(
 )

 MTEB_RETRIEVAL_LAW = Benchmark(
-
+    # This benchmark is likely in the need of an update
+    name="MTEB(Law, v1)",
+    aliases=["MTEB(law)"],
     display_name="Legal",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
     tasks=get_tasks(
@@ -416,6 +422,7 @@ MTEB_RETRIEVAL_LAW = Benchmark(

 MTEB_RETRIEVAL_MEDICAL = Benchmark(
     name="MTEB(Medical, v1)",
+    aliases=["MTEB(Medical)"],
     display_name="Medical",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
     tasks=get_tasks(
@@ -469,6 +476,7 @@ MTEB_MINERS_BITEXT_MINING = Benchmark(

 SEB = Benchmark(
     name="MTEB(Scandinavian, v1)",
+    aliases=["MTEB(Scandinavian)", "SEB"],
     display_name="Scandinavian",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg",
     language_view=["dan-Latn", "swe-Latn", "nno-Latn", "nob-Latn"],
@@ -595,6 +603,7 @@ RAR_b = Benchmark(

 MTEB_FRA = Benchmark(
     name="MTEB(fra, v1)",
+    aliases=["MTEB(fra)"],
     display_name="French",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
     tasks=MTEBTasks(
@@ -653,6 +662,7 @@ MTEB_FRA = Benchmark(

 MTEB_DEU = Benchmark(
     name="MTEB(deu, v1)",
+    aliases=["MTEB(deu)"],
     display_name="German",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
     tasks=get_tasks(
@@ -704,6 +714,7 @@ MTEB_DEU = Benchmark(

 MTEB_KOR = Benchmark(
     name="MTEB(kor, v1)",
+    aliases=["MTEB(kor)"],
     display_name="Korean",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
@@ -728,6 +739,7 @@ MTEB_KOR = Benchmark(

 MTEB_POL = Benchmark(
     name="MTEB(pol, v1)",
+    aliases=["MTEB(pol)"],
     display_name="Polish",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg",
     tasks=MTEBTasks(
@@ -777,6 +789,7 @@ two novel clustering tasks.""", # Rephrased from the abstract

 MTEB_code = Benchmark(
     name="MTEB(Code, v1)",
+    aliases=["MTEB(code)"],
     display_name="Code",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
     tasks=get_tasks(
@@ -953,6 +966,7 @@ MTEB_multilingual_v1 = Benchmark(

 MTEB_multilingual_v2 = Benchmark(
     name="MTEB(Multilingual, v2)",
+    aliases=["MTEB(Multilingual)", "MMTEB"],
     display_name="Multilingual",
     language_view=[
         "eng-Latn",  # English
@@ -986,6 +1000,7 @@ MTEB_multilingual_v2 = Benchmark(

 MTEB_JPN = Benchmark(
     name="MTEB(jpn, v1)",
+    aliases=["MTEB(jpn)"],
     display_name="Japanese Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
     tasks=get_tasks(
@@ -1056,6 +1071,7 @@ indic_languages = [

 MTEB_INDIC = Benchmark(
     name="MTEB(Indic, v1)",
+    aliases=["MTEB(Indic)"],
     display_name="Indic",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg",
     tasks=MTEBTasks(
@@ -1146,6 +1162,7 @@ eu_languages = [

 MTEB_EU = Benchmark(
     name="MTEB(Europe, v1)",
+    aliases=["MTEB(Europe)"],
     display_name="European",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg",
     tasks=get_tasks(
@@ -1285,6 +1302,7 @@ BRIGHT = Benchmark(

 BRIGHT_LONG = Benchmark(
     name="BRIGHT (long)",
+    aliases=["BRIGHT(long)"],
     tasks=MTEBTasks(
         (
             get_task(
@@ -1400,6 +1418,7 @@ NANOBEIR = Benchmark(

 C_MTEB = Benchmark(
     name="MTEB(cmn, v1)",
+    aliases=["MTEB(Chinese)", "CMTEB"],
     display_name="Chinese",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg",
     tasks=MTEBTasks(
@@ -1466,6 +1485,7 @@ C_MTEB = Benchmark(

 FA_MTEB = Benchmark(
     name="MTEB(fas, v1)",
+    aliases=["FaMTEB(fas, beta)"],
     display_name="Farsi Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg",
     tasks=get_tasks(
@@ -2347,6 +2367,7 @@ VIDORE_V3 = VidoreBenchmark(

 VISUAL_DOCUMENT_RETRIEVAL = VidoreBenchmark(
     name="ViDoRe(v1&v2)",
+    aliases=["VisualDocumentRetrieval"],
     display_name="ViDoRe (V1&V2)",
     tasks=get_tasks(
         tasks=[