mteb-2.5.3-py3-none-any.whl → mteb-2.5.4-py3-none-any.whl
This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +20 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +5 -5
- mteb/cli/generate_model_card.py +6 -4
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +11 -12
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/RECORD +102 -101
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/results/task_result.py
CHANGED

@@ -3,9 +3,8 @@ from __future__ import annotations
 import json
 import logging
 import warnings
-from argparse import Namespace
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, Mapping
 from functools import cached_property
 from importlib.metadata import version
 from pathlib import Path
@@ -17,8 +16,11 @@ from packaging.version import Version
 from pydantic import BaseModel, field_validator
 from typing_extensions import Self

+from mteb import TaskMetadata
 from mteb._helpful_enum import HelpfulStrEnum
+from mteb.abstasks import AbsTaskClassification
 from mteb.abstasks.abstask import AbsTask
+from mteb.abstasks.task_metadata import TaskDomain
 from mteb.languages import LanguageScripts
 from mteb.models.model_meta import ScoringFunction
 from mteb.types import (
@@ -40,67 +42,59 @@ class Criteria(HelpfulStrEnum):
     DATASET_REVISION = "dataset_revision"


-class ScalaNbClassificationDummy:
+class ScalaNbClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""

-    metadata =
+    metadata = TaskMetadata(
         name="ScalaNbClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-
-
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )


-class ScalaNnClassificationDummy:
+class ScalaNnClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""

-    metadata =
+    metadata = TaskMetadata(
         name="ScalaNnClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-
-
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["nob-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )


-class ScalaDaClassificationDummy:
+class ScalaDaClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""

-    metadata =
+    metadata = TaskMetadata(
         name="ScalaDaClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-
-
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["dan-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )


-class ScalaSvClassificationDummy:
+class ScalaSvClassificationDummy(AbsTaskClassification):
     """A dummy task for loading historic results from before v1.11.0"""

-    metadata =
+    metadata = TaskMetadata(
         name="ScalaSvClassification",
+        description="A dummy",
         main_score="accuracy",
         type="Classification",
-
-
-        },
-        dataset={"revision": "revision_not_applicable"},
-        revision="revision_not_applicable",
+        eval_langs=["swe-Latn"],
+        dataset={"path": "not/exists", "revision": "revision_not_applicable"},
     )


-outdated_tasks = {
+outdated_tasks: dict[str, type[AbsTask]] = {
     "ScalaNbClassification": ScalaNbClassificationDummy,
     "ScalaNnClassification": ScalaNnClassificationDummy,
     "ScalaDaClassification": ScalaDaClassificationDummy,
@@ -167,10 +161,10 @@ class TaskResult(BaseModel):
     def from_task_results(
         cls,
         task: AbsTask | type[AbsTask],
-        scores: dict[SplitName,
+        scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
         evaluation_time: float,
         kg_co2_emissions: float | None = None,
-    ) ->
+    ) -> TaskResult:
         """Create a TaskResult from the task and scores.

         Args:
@@ -247,12 +241,12 @@ class TaskResult(BaseModel):
         return get_task(self.task_name)

     @property
-    def domains(self) -> list[
+    def domains(self) -> list[TaskDomain]:
         """Get the domains of the task."""
         doms = self.task.metadata.domains
         if doms is None:
             doms = []
-        return doms
+        return doms

     @property
     def task_type(self) -> str:
@@ -308,7 +302,7 @@ class TaskResult(BaseModel):
             if isinstance(v, dict):
                 self._round_scores(v, n)
             elif isinstance(v, float):
-                value[i] = round(v, n)
+                value[i] = round(v, n)  # type: ignore[call-overload]

         elif isinstance(value, float):
             scores[key] = round(value, n)
@@ -326,7 +320,7 @@ class TaskResult(BaseModel):
             json.dump(json_obj, f, indent=2)

     @classmethod
-    def from_disk(cls, path: Path, load_historic_data: bool = True) ->
+    def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
         """Load TaskResult from disk.

         Args:
@@ -357,7 +351,7 @@ class TaskResult(BaseModel):
         )  # assume it is before 1.11.0 if the version is not present

         try:
-            obj = cls.model_validate(data)
+            obj: TaskResult = cls.model_validate(data)
         except Exception as e:
             if not pre_1_11_load:
                 raise e
@@ -382,6 +376,7 @@ class TaskResult(BaseModel):
         from mteb import get_task

         task_name = obj.task_name
+        task: AbsTask | type[AbsTask]
         if task_name in outdated_tasks:
             task = outdated_tasks[task_name]
         else:
@@ -394,11 +389,11 @@ class TaskResult(BaseModel):
         for key in list(hf_subset_scores.keys()):
             if isinstance(hf_subset_scores[key], dict):
                 for k, v in hf_subset_scores[key].items():
-                    hf_subset_scores[f"{key}_{k}"] = v
-                hf_subset_scores.pop(key)
+                    hf_subset_scores[f"{key}_{k}"] = v  # type: ignore[index]
+                hf_subset_scores.pop(key)  # type: ignore[attr-defined]

     @classmethod
-    def _convert_from_before_v1_11_0(cls, data: dict) ->
+    def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
         from mteb.get_tasks import _TASKS_REGISTRY

         # in case the task name is not found in the registry, try to find a lower case version
@@ -484,7 +479,7 @@ class TaskResult(BaseModel):
             scores["test"]["fra-fra"] = scores["test"].pop("fr")

         result: TaskResult = TaskResult.from_task_results(
-            task,
+            task,
             scores,
             evaluation_time,
             kg_co2_emissions=None,
@@ -535,7 +530,7 @@ class TaskResult(BaseModel):
     def _get_score_fast(
         self,
         splits: Iterable[str] | None = None,
-        languages:
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
         subsets: Iterable[str] | None = None,
     ) -> float:
         """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
@@ -584,7 +579,7 @@ class TaskResult(BaseModel):
         return val_sum / n_val

     @classmethod
-    def from_validated(cls, **data) ->
+    def from_validated(cls, **data) -> TaskResult:
         """Create a TaskResult from validated data.

         Returns:
@@ -595,13 +590,13 @@ class TaskResult(BaseModel):
     def __repr__(self) -> str:
         return f"TaskResult(task_name={self.task_name}, scores=...)"

-    def only_main_score(self) ->
+    def only_main_score(self) -> TaskResult:
         """Return a new TaskResult object with only the main score.

         Returns:
             A new TaskResult object with only the main score.
         """
-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         for split in self.scores:
             new_scores[split] = []
             for subset_scores in self.scores[split]:
@@ -613,10 +608,9 @@ class TaskResult(BaseModel):
                 }
             )
         new_res = {**self.to_dict(), "scores": new_scores}
-
-        return new_res
+        return TaskResult.from_validated(**new_res)

-    def validate_and_filter_scores(self, task: AbsTask | None = None) ->
+    def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult:
         """Validate and filter the scores against the task metadata.

         This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
@@ -638,7 +632,7 @@ class TaskResult(BaseModel):
         splits = task.eval_splits
         hf_subsets = set(task.hf_subsets)  # Convert to set once

-        new_scores = {}
+        new_scores: dict[str, list[Score]] = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
@@ -739,7 +733,7 @@ class TaskResult(BaseModel):
             "mteb_version",
             "dataset_revision",
         ],
-    ) ->
+    ) -> TaskResult:
         """Merges two TaskResult objects.

         Args:
mteb/similarity_functions.py
CHANGED

@@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
         b,
     )

-    return scores.max(axis=-1).values.sum(axis=-1)
+    return scores.max(axis=-1).values.sum(axis=-1)  # type: ignore[call-overload]


 # https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
@@ -217,7 +217,7 @@ def pairwise_max_sim(
             document_embedding,
         )

-        scores.append(query_document_score.max(axis=-1).values.sum())
+        scores.append(query_document_score.max(axis=-1).values.sum())  # type: ignore[call-overload]

     return torch.stack(scores, dim=0)
@@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
     Returns:
         Matrix with similarities
     """
-
-
+    text_embeddings_tensor = _convert_to_tensor(text_embeddings)
+    input_embeddings_tensor = _convert_to_tensor(input_embeddings)

-
-
-
+    text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
+        dim=-1, keepdim=True
+    )
+    input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
+        dim=-1, keepdim=True
+    )
+    logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
     probs = (logits * 100).softmax(dim=-1)
     return probs
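The rebuilt similarity body L2-normalizes both embedding matrices and softmaxes the scaled cosine logits. A standalone sketch of the same computation, assuming plain torch tensors (mteb's _convert_to_tensor step is elided):

import torch

def similarity_sketch(text_embeddings: torch.Tensor, input_embeddings: torch.Tensor) -> torch.Tensor:
    # L2-normalize along the feature dimension so the matmul yields cosines
    text = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
    inputs = input_embeddings / input_embeddings.norm(dim=-1, keepdim=True)
    # scale by 100 (a CLIP-style temperature) and softmax over candidate texts
    logits = torch.matmul(inputs, text.T)
    return (logits * 100).softmax(dim=-1)

probs = similarity_sketch(torch.randn(4, 8), torch.randn(2, 8))
assert probs.shape == (2, 4)
assert torch.allclose(probs.sum(dim=-1), torch.ones(2))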
mteb/tasks/classification/dan/dk_hate_classification.py
CHANGED

@@ -62,7 +62,7 @@ Piperidis, Stelios},

     def dataset_transform(self):
         # convert label to a 0/1 label
-        labels = self.dataset["train"]["label"]
+        labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
         self.dataset = self.dataset.map(
             lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/est/estonian_valence.py
CHANGED

@@ -45,7 +45,7 @@ class EstonianValenceClassification(AbsTaskClassification):
             "valence", "label"
         )
         # convert label to a numbers
-        labels = self.dataset["train"]["label"]
+        labels = self.dataset["train"]["label"]
         lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
         self.dataset = self.dataset.map(
             lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
mteb/tasks/classification/multilingual/scala_classification.py
CHANGED

@@ -57,7 +57,7 @@ Fishel, Mark},
     def dataset_transform(self):
         for lang in self.dataset.keys():
             # convert label to a 0/1 label
-            labels = self.dataset[lang]["train"]["label"]
+            labels = self.dataset[lang]["train"]["label"]
             lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
             self.dataset[lang] = self.dataset[lang].map(
                 lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
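The three classification hunks above all touch the same label-encoding idiom: collect the string labels, build a label-to-index mapping, then rewrite the column via map. A self-contained illustration with hypothetical toy data, not drawn from these tasks:

from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "b", "c"], "label": ["pos", "neg", "pos"]})
labels = ds["label"]
# map each distinct string label to an integer index, then rewrite the column;
# remove_columns drops the old column and the returned dict re-adds it as ints
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
ds = ds.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
assert sorted(set(ds["label"])) == [0, 1]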
mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py
CHANGED

@@ -49,7 +49,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset = datasets.DatasetDict({"test": self.dataset["train"]})
         self.dataset_transform()
         self.data_loaded = True
mteb/tasks/retrieval/code/code_rag.py
CHANGED

@@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/programming-solutions",
             "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
         },
-        **common_args,
+        **common_args,
     )

     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -71,7 +71,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
         self.queries = {}

         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)

         self.queries[split] = {}
@@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/online-tutorials",
             "revision": "095bb77130082e4690d6c3a031997b03487bf6e2",
         },
-        **common_args,
+        **common_args,
     )

     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -128,7 +128,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
         self.queries = {}

         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)

         self.queries[split] = {}
@@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/library-documentation",
             "revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
         },
-        **common_args,
+        **common_args,
     )

     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -188,7 +188,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
         self.queries = {}

         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)

         self.queries[split] = {}
@@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
             "path": "code-rag-bench/stackoverflow-posts",
             "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4",
         },
-        **common_args,
+        **common_args,
     )

     def load_data(self) -> None:
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -245,7 +245,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
         self.queries = {}

         split = self.metadata.eval_splits[0]
-        ds: datasets.Dataset = self.dataset[split]
+        ds: datasets.Dataset = self.dataset[split]
         ds = ds.shuffle(seed=42)

         self.queries[split] = {}
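All four CodeRAG tasks repeat the same idempotent load_data() guard. An illustrative reduction of that pattern, not mteb's actual base class; the dataset path and revision are taken from the first hunk above:

import datasets

class LoadDataSketch:
    """Condensed version of the load_data() idiom (illustrative only)."""

    data_loaded: bool = False
    dataset_args = {
        "path": "code-rag-bench/programming-solutions",
        "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
    }

    def dataset_transform(self) -> None:
        pass  # per-task post-processing hook

    def load_data(self) -> None:
        if self.data_loaded:  # idempotent: a second call is a no-op
            return
        self.dataset = datasets.load_dataset(**self.dataset_args)
        self.dataset_transform()
        self.data_loaded = True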
mteb/tasks/retrieval/dan/dan_fever_retrieval.py
CHANGED

@@ -51,7 +51,7 @@ Derczynski, Leon},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
mteb/tasks/retrieval/dan/tv2_nordretrieval.py
CHANGED

@@ -64,7 +64,7 @@ Piperidis, Stelios},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -81,7 +81,7 @@ Piperidis, Stelios},
         text2id = {}

         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             ds = ds.select(
                 range(2048)
mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py
CHANGED

@@ -40,7 +40,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -57,7 +57,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
         text2id = {}

         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.map(answers_to_list)

             self.queries[split] = {}
mteb/tasks/retrieval/nob/norquad.py
CHANGED

@@ -54,7 +54,7 @@ Fishel, Mark},
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -71,7 +71,7 @@ Fishel, Mark},
         text2id = {}

         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             max_samples = min(1024, len(ds))
             ds = ds.select(
mteb/tasks/retrieval/nob/snl_retrieval.py
CHANGED

@@ -41,7 +41,7 @@ class SNLRetrieval(AbsTaskRetrieval):
         """Load dataset from HuggingFace hub"""
         if self.data_loaded:
             return
-        self.dataset = datasets.load_dataset(**self.metadata.dataset)
+        self.dataset = datasets.load_dataset(**self.metadata.dataset)
         self.dataset_transform()
         self.data_loaded = True
@@ -58,7 +58,7 @@ class SNLRetrieval(AbsTaskRetrieval):
         text2id = {}

         for split in self.dataset:
-            ds: datasets.Dataset = self.dataset[split]
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)

             self.queries[split] = {}
mteb/tasks/retrieval/tur/tur_hist_quad.py
CHANGED

@@ -59,7 +59,7 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
         text2id = {}

         for split in self.metadata.eval_splits:
-            ds: datasets.Dataset = self.dataset[split]
+            ds: datasets.Dataset = self.dataset[split]
             ds = ds.shuffle(seed=42)
             max_samples = min(1024, len(ds))
             ds = ds.select(
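Several of the retrieval hunks above touch the same deterministic subsampling idiom: shuffle with a fixed seed, cap the sample count, then select. A runnable sketch on synthetic data:

from datasets import Dataset

ds = Dataset.from_dict({"question": [f"q{i}" for i in range(5000)]})
ds = ds.shuffle(seed=42)            # fixed seed keeps the subsample reproducible
max_samples = min(1024, len(ds))    # cap without failing on splits smaller than the cap
ds = ds.select(range(max_samples))
assert len(ds) == 1024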
mteb/types/_result.py
CHANGED

@@ -1,3 +1,4 @@
+from collections.abc import Mapping
 from typing import Any, NamedTuple

 HFSubset = str
@@ -8,7 +9,7 @@ SplitName = str
 Score = Any
 """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. Should be json serializable."""

-ScoresDict =
+ScoresDict = Mapping[str, Score]
 """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}"""
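ScoresDict is now aliased to Mapping[str, Score] rather than a concrete dict type. Since dict implements Mapping, existing call sites still type-check, while read-only mappings are admitted too; a small sketch of the effect (the function name here is illustrative):

from collections.abc import Mapping
from types import MappingProxyType
from typing import Any

Score = Any
ScoresDict = Mapping[str, Score]

def main_score(scores: ScoresDict) -> float:
    # Mapping signals that consumers read scores without mutating them
    return float(scores["main_score"])

print(main_score({"main_score": 0.5, "accuracy": 0.5}))    # plain dict
print(main_score(MappingProxyType({"main_score": 0.5})))   # read-only view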
mteb/types/statistics.py
CHANGED

@@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict):


 class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics):
-    """Class for descriptive statistics for the full task.
+    """Class for descriptive statistics for the full task.

+    Attributes:
+        num_samples: Total number of samples
+        hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets)
+    """
+
+    num_samples: int
     hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]]

@@ -88,9 +94,9 @@ class ScoreStatistics(TypedDict):
         max_score: Maximum score
     """

-    min_score: int
+    min_score: int | float
     avg_score: float
-    max_score: int
+    max_score: int | float


 class TopRankedStatistics(TypedDict):
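The statistics change widens min_score/max_score to int | float and documents the DescriptiveStatistics fields. A condensed sketch of the resulting shape (field subset only; a type checker already accepts int where float is annotated, so the union mainly documents intent):

from __future__ import annotations

from typing_extensions import NotRequired, TypedDict

class ScoreStatisticsSketch(TypedDict):
    min_score: int | float  # scores may be exact ints or floats
    avg_score: float
    max_score: int | float

class DescriptiveStatisticsSketch(TypedDict):
    num_samples: int
    # present only for multilingual datasets
    hf_subset_descriptive_stats: NotRequired[dict[str, dict]]

stats: ScoreStatisticsSketch = {"min_score": 0, "avg_score": 0.5, "max_score": 1.0}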
{mteb-2.5.3.dist-info → mteb-2.5.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.5.3
+Version: 2.5.4
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>