mteb 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +7 -3
- mteb/_evaluators/any_sts_evaluator.py +6 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -5
- mteb/_evaluators/pair_classification_evaluator.py +2 -2
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/sklearn_evaluator.py +3 -3
- mteb/_evaluators/text/bitext_mining_evaluator.py +5 -3
- mteb/_evaluators/text/summarization_evaluator.py +3 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/abstask.py +3 -2
- mteb/abstasks/aggregated_task.py +3 -3
- mteb/abstasks/classification.py +3 -3
- mteb/abstasks/clustering.py +2 -2
- mteb/abstasks/clustering_legacy.py +2 -2
- mteb/abstasks/image/image_text_pair_classification.py +2 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/pair_classification.py +2 -2
- mteb/abstasks/retrieval.py +15 -14
- mteb/abstasks/sts.py +2 -2
- mteb/abstasks/text/bitext_mining.py +3 -3
- mteb/abstasks/text/summarization.py +2 -2
- mteb/abstasks/zeroshot_classification.py +3 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +24 -0
- mteb/cli/build_cli.py +2 -1
- mteb/deprecated_evaluator.py +3 -3
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/evaluate.py +5 -3
- mteb/models/abs_encoder.py +3 -1
- mteb/models/instruct_wrapper.py +1 -1
- mteb/models/model_implementations/bm25.py +3 -3
- mteb/models/model_implementations/jina_clip.py +46 -8
- mteb/models/model_implementations/mxbai_models.py +118 -1
- mteb/models/model_implementations/nvidia_models.py +73 -5
- mteb/models/model_implementations/octen_models.py +30 -0
- mteb/models/model_implementations/pylate_models.py +5 -4
- mteb/models/model_implementations/sentence_transformers_models.py +66 -0
- mteb/models/models_protocols.py +6 -4
- mteb/models/search_wrappers.py +7 -6
- mteb/models/sentence_transformer_wrapper.py +5 -4
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/METADATA +1 -1
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/RECORD +55 -50
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/WHEEL +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/entry_points.txt +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py
CHANGED
|
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
|
|
|
23
23
|
def _create_dataloader_from_texts(
|
|
24
24
|
text: list[str],
|
|
25
25
|
batch_size: int = 32,
|
|
26
|
-
**kwargs:
|
|
26
|
+
**kwargs: Any,
|
|
27
27
|
) -> DataLoader[TextInput]:
|
|
28
28
|
"""Create a dataloader from a list of text.
|
|
29
29
|
|
|
@@ -191,7 +191,8 @@ def _create_dataloader_for_queries_conversation(
|
|
|
191
191
|
"""
|
|
192
192
|
return DataLoader(
|
|
193
193
|
queries.map(
|
|
194
|
-
_convert_conv_history_to_query,
|
|
194
|
+
_convert_conv_history_to_query,
|
|
195
|
+
desc="Converting conversations to queries",
|
|
195
196
|
),
|
|
196
197
|
collate_fn=_custom_collate_fn,
|
|
197
198
|
batch_size=batch_size,
|
|
@@ -361,6 +362,9 @@ def _create_document_dataloader(
|
|
|
361
362
|
task_metadata: Metadata of the task to determine the document type.
|
|
362
363
|
input_column: The column to use as input. If None, it will use the first column that matches the modality.
|
|
363
364
|
batch_size: Batch size for the dataloader.
|
|
365
|
+
|
|
366
|
+
Returns:
|
|
367
|
+
A dataloader for the documents.
|
|
364
368
|
"""
|
|
365
369
|
document_type = task_metadata.get_modalities(PromptType.document)
|
|
366
370
|
if document_type == ["text"]: # text only
|
|
@@ -383,7 +387,7 @@ def create_dataloader(
|
|
|
383
387
|
prompt_type: PromptType | None = None,
|
|
384
388
|
input_column: str | None = None,
|
|
385
389
|
batch_size: int = 32,
|
|
386
|
-
**kwargs:
|
|
390
|
+
**kwargs: Any,
|
|
387
391
|
) -> DataLoader[BatchedInput]:
|
|
388
392
|
"""Create a dataloader from a dataset.
|
|
389
393
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import TypedDict
|
|
3
3
|
|
|
4
4
|
from datasets import Dataset
|
|
5
5
|
from sklearn.metrics.pairwise import (
|
|
@@ -12,7 +12,7 @@ from mteb._create_dataloaders import create_dataloader
|
|
|
12
12
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models import EncoderProtocol
|
|
14
14
|
from mteb.similarity_functions import compute_pairwise_similarity
|
|
15
|
-
from mteb.types import PromptType
|
|
15
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
16
16
|
|
|
17
17
|
from .evaluator import Evaluator
|
|
18
18
|
|
|
@@ -57,7 +57,10 @@ class AnySTSEvaluator(Evaluator):
|
|
|
57
57
|
self.input2_prompt_type = input2_prompt_type
|
|
58
58
|
|
|
59
59
|
def __call__(
|
|
60
|
-
self,
|
|
60
|
+
self,
|
|
61
|
+
model: EncoderProtocol,
|
|
62
|
+
*,
|
|
63
|
+
encode_kwargs: EncodeKwargs,
|
|
61
64
|
) -> STSEvaluatorScores:
|
|
62
65
|
logger.info("Running semantic similarity - Encoding samples (1/2)")
|
|
63
66
|
embeddings1 = model.encode(
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any
|
|
3
2
|
|
|
4
3
|
from datasets import Dataset
|
|
5
4
|
from sklearn import cluster
|
|
@@ -7,6 +6,7 @@ from sklearn import cluster
|
|
|
7
6
|
from mteb._create_dataloaders import create_dataloader
|
|
8
7
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
8
|
from mteb.models import EncoderProtocol
|
|
9
|
+
from mteb.types import EncodeKwargs
|
|
10
10
|
|
|
11
11
|
from .evaluator import Evaluator
|
|
12
12
|
|
|
@@ -38,7 +38,7 @@ class ClusteringEvaluator(Evaluator):
|
|
|
38
38
|
self,
|
|
39
39
|
model: EncoderProtocol,
|
|
40
40
|
*,
|
|
41
|
-
encode_kwargs:
|
|
41
|
+
encode_kwargs: EncodeKwargs,
|
|
42
42
|
) -> list[int]:
|
|
43
43
|
data_loader = create_dataloader(
|
|
44
44
|
self.dataset,
|
mteb/_evaluators/evaluator.py
CHANGED
|
@@ -4,6 +4,7 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from mteb.abstasks.abstask import _set_seed
|
|
6
6
|
from mteb.models import EncoderProtocol
|
|
7
|
+
from mteb.types import EncodeKwargs
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class Evaluator(ABC):
|
|
@@ -18,7 +19,7 @@ class Evaluator(ABC):
|
|
|
18
19
|
|
|
19
20
|
@abstractmethod
|
|
20
21
|
def __call__(
|
|
21
|
-
self, model: EncoderProtocol, *, encode_kwargs:
|
|
22
|
+
self, model: EncoderProtocol, *, encode_kwargs: EncodeKwargs
|
|
22
23
|
) -> Mapping[str, float] | Iterable[Any]:
|
|
23
24
|
"""This is called during training to evaluate the model.
|
|
24
25
|
|
|
@@ -6,16 +6,17 @@ from typing import TYPE_CHECKING, Any
|
|
|
6
6
|
|
|
7
7
|
import torch
|
|
8
8
|
import torch.nn.functional as F
|
|
9
|
-
from datasets import Dataset
|
|
10
9
|
from torch.utils.data import DataLoader
|
|
11
10
|
|
|
12
11
|
from mteb._create_dataloaders import (
|
|
12
|
+
_create_dataloader_from_texts,
|
|
13
13
|
_transform_image_to_rgb,
|
|
14
14
|
)
|
|
15
15
|
from mteb._evaluators.evaluator import Evaluator
|
|
16
16
|
from mteb._requires_package import requires_image_dependencies
|
|
17
17
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
18
18
|
from mteb.models.models_protocols import EncoderProtocol
|
|
19
|
+
from mteb.types import EncodeKwargs
|
|
19
20
|
|
|
20
21
|
if TYPE_CHECKING:
|
|
21
22
|
from PIL.Image import Image
|
|
@@ -84,7 +85,10 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
84
85
|
self.hf_subset = hf_subset
|
|
85
86
|
|
|
86
87
|
def __call__( # type: ignore[override]
|
|
87
|
-
self,
|
|
88
|
+
self,
|
|
89
|
+
model: EncoderProtocol,
|
|
90
|
+
*,
|
|
91
|
+
encode_kwargs: EncodeKwargs,
|
|
88
92
|
) -> list[torch.Tensor]:
|
|
89
93
|
images = []
|
|
90
94
|
if isinstance(self.images_column_names, str):
|
|
@@ -105,8 +109,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
105
109
|
texts.append(row[col])
|
|
106
110
|
|
|
107
111
|
text_embeddings = model.encode(
|
|
108
|
-
|
|
109
|
-
|
|
112
|
+
_create_dataloader_from_texts(
|
|
113
|
+
texts,
|
|
110
114
|
**encode_kwargs,
|
|
111
115
|
),
|
|
112
116
|
task_metadata=self.task_metadata,
|
|
@@ -127,7 +131,6 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
127
131
|
DataLoader(
|
|
128
132
|
CustomImageDataset(images),
|
|
129
133
|
collate_fn=lambda x: {"image": [item["image"] for item in x]},
|
|
130
|
-
**encode_kwargs,
|
|
131
134
|
),
|
|
132
135
|
task_metadata=self.task_metadata,
|
|
133
136
|
hf_subset=self.hf_subset,
|
|
@@ -14,7 +14,7 @@ from mteb._evaluators.evaluator import Evaluator
|
|
|
14
14
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
15
|
from mteb.models import EncoderProtocol
|
|
16
16
|
from mteb.similarity_functions import compute_pairwise_similarity
|
|
17
|
-
from mteb.types import PromptType
|
|
17
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
18
18
|
|
|
19
19
|
logger = logging.getLogger(__name__)
|
|
20
20
|
|
|
@@ -85,7 +85,7 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
85
85
|
def __call__(
|
|
86
86
|
self,
|
|
87
87
|
model: EncoderProtocol,
|
|
88
|
-
encode_kwargs:
|
|
88
|
+
encode_kwargs: EncodeKwargs,
|
|
89
89
|
) -> PairClassificationDistances:
|
|
90
90
|
logger.info("Running pair classification - Encoding samples (1/2)")
|
|
91
91
|
embeddings1 = model.encode(
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections.abc import Sequence
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
6
5
|
from mteb.models import SearchProtocol
|
|
7
6
|
from mteb.types import (
|
|
8
7
|
CorpusDatasetType,
|
|
8
|
+
EncodeKwargs,
|
|
9
9
|
QueryDatasetType,
|
|
10
10
|
RelevantDocumentsType,
|
|
11
11
|
RetrievalEvaluationResult,
|
|
@@ -48,7 +48,7 @@ class RetrievalEvaluator(Evaluator):
|
|
|
48
48
|
def __call__( # type: ignore[override]
|
|
49
49
|
self,
|
|
50
50
|
search_model: SearchProtocol,
|
|
51
|
-
encode_kwargs:
|
|
51
|
+
encode_kwargs: EncodeKwargs,
|
|
52
52
|
) -> RetrievalOutputType:
|
|
53
53
|
logger.info("Running retrieval task - Indexing corpus...")
|
|
54
54
|
search_model.index(
|
|
@@ -9,7 +9,7 @@ from typing_extensions import Self
|
|
|
9
9
|
from mteb._create_dataloaders import create_dataloader
|
|
10
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
11
|
from mteb.models import EncoderProtocol
|
|
12
|
-
from mteb.types import Array, BatchedInput
|
|
12
|
+
from mteb.types import Array, BatchedInput, EncodeKwargs
|
|
13
13
|
|
|
14
14
|
from .evaluator import Evaluator
|
|
15
15
|
|
|
@@ -50,7 +50,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
50
50
|
self.evaluator_model = evaluator_model
|
|
51
51
|
|
|
52
52
|
def create_dataloaders(
|
|
53
|
-
self, encode_kwargs:
|
|
53
|
+
self, encode_kwargs: EncodeKwargs
|
|
54
54
|
) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
|
|
55
55
|
dataloader_train = create_dataloader(
|
|
56
56
|
self.train_dataset,
|
|
@@ -70,7 +70,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
70
70
|
self,
|
|
71
71
|
model: EncoderProtocol,
|
|
72
72
|
*,
|
|
73
|
-
encode_kwargs:
|
|
73
|
+
encode_kwargs: EncodeKwargs,
|
|
74
74
|
test_cache: Array | None = None,
|
|
75
75
|
) -> tuple[np.ndarray, Array]:
|
|
76
76
|
"""Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any
|
|
3
2
|
|
|
4
3
|
import torch
|
|
5
4
|
from datasets import Dataset
|
|
@@ -9,7 +8,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
|
|
|
9
8
|
from mteb._evaluators.evaluator import Evaluator
|
|
10
9
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
10
|
from mteb.models import EncoderProtocol
|
|
12
|
-
from mteb.types import Array
|
|
11
|
+
from mteb.types import Array, EncodeKwargs
|
|
13
12
|
|
|
14
13
|
logger = logging.getLogger(__name__)
|
|
15
14
|
|
|
@@ -33,7 +32,10 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
33
32
|
self.task_metadata = task_metadata
|
|
34
33
|
|
|
35
34
|
def __call__(
|
|
36
|
-
self,
|
|
35
|
+
self,
|
|
36
|
+
model: EncoderProtocol,
|
|
37
|
+
*,
|
|
38
|
+
encode_kwargs: EncodeKwargs,
|
|
37
39
|
) -> dict[str, list[dict[str, float]]]:
|
|
38
40
|
pair_elements = {p for pair in self.pairs for p in pair}
|
|
39
41
|
if isinstance(self.sentences, Dataset):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import sys
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import TypedDict
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import torch
|
|
@@ -12,6 +12,7 @@ from mteb._evaluators.evaluator import Evaluator
|
|
|
12
12
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models import EncoderProtocol
|
|
14
14
|
from mteb.similarity_functions import cos_sim, dot_score
|
|
15
|
+
from mteb.types import EncodeKwargs
|
|
15
16
|
|
|
16
17
|
# if later than python 3.13 use typing module
|
|
17
18
|
if sys.version_info >= (3, 13):
|
|
@@ -94,7 +95,7 @@ class SummarizationEvaluator(Evaluator):
|
|
|
94
95
|
self,
|
|
95
96
|
model: EncoderProtocol,
|
|
96
97
|
*,
|
|
97
|
-
encode_kwargs:
|
|
98
|
+
encode_kwargs: EncodeKwargs,
|
|
98
99
|
) -> SummarizationDistances:
|
|
99
100
|
# Get the human & machine summaries for the text in one go for all
|
|
100
101
|
human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any
|
|
3
2
|
|
|
4
3
|
from datasets import Dataset
|
|
5
4
|
|
|
@@ -10,7 +9,7 @@ from mteb._create_dataloaders import (
|
|
|
10
9
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
10
|
from mteb.models import EncoderProtocol
|
|
12
11
|
from mteb.similarity_functions import similarity
|
|
13
|
-
from mteb.types import Array
|
|
12
|
+
from mteb.types import Array, EncodeKwargs
|
|
14
13
|
|
|
15
14
|
from .evaluator import Evaluator
|
|
16
15
|
|
|
@@ -38,7 +37,10 @@ class ZeroShotClassificationEvaluator(Evaluator):
|
|
|
38
37
|
self.hf_subset = hf_subset
|
|
39
38
|
|
|
40
39
|
def __call__(
|
|
41
|
-
self,
|
|
40
|
+
self,
|
|
41
|
+
model: EncoderProtocol,
|
|
42
|
+
*,
|
|
43
|
+
encode_kwargs: EncodeKwargs,
|
|
42
44
|
) -> Array:
|
|
43
45
|
dataloader = create_dataloader(
|
|
44
46
|
self.dataset,
|
mteb/abstasks/abstask.py
CHANGED
|
@@ -23,6 +23,7 @@ from mteb.models import (
|
|
|
23
23
|
SearchProtocol,
|
|
24
24
|
)
|
|
25
25
|
from mteb.types import HFSubset, Modalities, ScoresDict
|
|
26
|
+
from mteb.types._encoder_io import EncodeKwargs
|
|
26
27
|
from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
|
|
27
28
|
|
|
28
29
|
logger = logging.getLogger(__name__)
|
|
@@ -121,7 +122,7 @@ class AbsTask(ABC):
|
|
|
121
122
|
split: str = "test",
|
|
122
123
|
subsets_to_run: list[HFSubset] | None = None,
|
|
123
124
|
*,
|
|
124
|
-
encode_kwargs:
|
|
125
|
+
encode_kwargs: EncodeKwargs,
|
|
125
126
|
prediction_folder: Path | None = None,
|
|
126
127
|
**kwargs: Any,
|
|
127
128
|
) -> Mapping[HFSubset, ScoresDict]:
|
|
@@ -201,7 +202,7 @@ class AbsTask(ABC):
|
|
|
201
202
|
*,
|
|
202
203
|
hf_split: str,
|
|
203
204
|
hf_subset: str,
|
|
204
|
-
encode_kwargs:
|
|
205
|
+
encode_kwargs: EncodeKwargs,
|
|
205
206
|
prediction_folder: Path | None = None,
|
|
206
207
|
**kwargs: Any,
|
|
207
208
|
) -> ScoresDict:
|
mteb/abstasks/aggregated_task.py
CHANGED
|
@@ -9,7 +9,7 @@ from datasets import Dataset, DatasetDict
|
|
|
9
9
|
|
|
10
10
|
from mteb.models.models_protocols import MTEBModels
|
|
11
11
|
from mteb.results.task_result import TaskResult
|
|
12
|
-
from mteb.types import HFSubset, ScoresDict
|
|
12
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
13
13
|
from mteb.types.statistics import DescriptiveStatistics
|
|
14
14
|
|
|
15
15
|
from .abstask import AbsTask
|
|
@@ -127,7 +127,7 @@ class AbsTaskAggregate(AbsTask):
|
|
|
127
127
|
split: str = "test",
|
|
128
128
|
subsets_to_run: list[HFSubset] | None = None,
|
|
129
129
|
*,
|
|
130
|
-
encode_kwargs:
|
|
130
|
+
encode_kwargs: EncodeKwargs,
|
|
131
131
|
prediction_folder: Path | None = None,
|
|
132
132
|
**kwargs: Any,
|
|
133
133
|
) -> dict[HFSubset, ScoresDict]:
|
|
@@ -141,7 +141,7 @@ class AbsTaskAggregate(AbsTask):
|
|
|
141
141
|
self,
|
|
142
142
|
model: MTEBModels,
|
|
143
143
|
data_split: DatasetDict | Dataset,
|
|
144
|
-
encode_kwargs:
|
|
144
|
+
encode_kwargs: EncodeKwargs,
|
|
145
145
|
**kwargs: Any,
|
|
146
146
|
) -> ScoresDict:
|
|
147
147
|
raise NotImplementedError(
|
mteb/abstasks/classification.py
CHANGED
|
@@ -16,7 +16,7 @@ from sklearn.metrics import (
|
|
|
16
16
|
|
|
17
17
|
from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
|
|
18
18
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
19
|
-
from mteb.types import HFSubset, ScoresDict
|
|
19
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
20
20
|
from mteb.types.statistics import (
|
|
21
21
|
ImageStatistics,
|
|
22
22
|
LabelStatistics,
|
|
@@ -125,7 +125,7 @@ class AbsTaskClassification(AbsTask):
|
|
|
125
125
|
split: str = "test",
|
|
126
126
|
subsets_to_run: list[HFSubset] | None = None,
|
|
127
127
|
*,
|
|
128
|
-
encode_kwargs:
|
|
128
|
+
encode_kwargs: EncodeKwargs,
|
|
129
129
|
prediction_folder: Path | None = None,
|
|
130
130
|
**kwargs: Any,
|
|
131
131
|
) -> dict[HFSubset, ScoresDict]:
|
|
@@ -184,7 +184,7 @@ class AbsTaskClassification(AbsTask):
|
|
|
184
184
|
model: MTEBModels,
|
|
185
185
|
data_split: DatasetDict,
|
|
186
186
|
*,
|
|
187
|
-
encode_kwargs:
|
|
187
|
+
encode_kwargs: EncodeKwargs,
|
|
188
188
|
hf_split: str,
|
|
189
189
|
hf_subset: str,
|
|
190
190
|
prediction_folder: Path | None = None,
|
mteb/abstasks/clustering.py
CHANGED
|
@@ -12,7 +12,7 @@ from sklearn.metrics.cluster import v_measure_score
|
|
|
12
12
|
|
|
13
13
|
from mteb._create_dataloaders import create_dataloader
|
|
14
14
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
15
|
-
from mteb.types import Array, HFSubset, ScoresDict
|
|
15
|
+
from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
|
|
16
16
|
from mteb.types.statistics import (
|
|
17
17
|
ImageStatistics,
|
|
18
18
|
LabelStatistics,
|
|
@@ -156,7 +156,7 @@ class AbsTaskClustering(AbsTask):
|
|
|
156
156
|
model: MTEBModels,
|
|
157
157
|
data_split: Dataset,
|
|
158
158
|
*,
|
|
159
|
-
encode_kwargs:
|
|
159
|
+
encode_kwargs: EncodeKwargs,
|
|
160
160
|
hf_split: str,
|
|
161
161
|
hf_subset: str,
|
|
162
162
|
prediction_folder: Path | None = None,
|
|
@@ -9,7 +9,7 @@ from sklearn import metrics
|
|
|
9
9
|
|
|
10
10
|
from mteb._evaluators import ClusteringEvaluator
|
|
11
11
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
12
|
-
from mteb.types import ScoresDict
|
|
12
|
+
from mteb.types import EncodeKwargs, ScoresDict
|
|
13
13
|
from mteb.types.statistics import (
|
|
14
14
|
ImageStatistics,
|
|
15
15
|
LabelStatistics,
|
|
@@ -83,7 +83,7 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
83
83
|
model: MTEBModels,
|
|
84
84
|
data_split: Dataset,
|
|
85
85
|
*,
|
|
86
|
-
encode_kwargs:
|
|
86
|
+
encode_kwargs: EncodeKwargs,
|
|
87
87
|
hf_split: str,
|
|
88
88
|
hf_subset: str,
|
|
89
89
|
prediction_folder: Path | None = None,
|
|
@@ -13,6 +13,7 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
13
13
|
)
|
|
14
14
|
from mteb.abstasks.abstask import AbsTask
|
|
15
15
|
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
|
|
16
|
+
from mteb.types import EncodeKwargs
|
|
16
17
|
from mteb.types.statistics import (
|
|
17
18
|
ImageStatistics,
|
|
18
19
|
SplitDescriptiveStatistics,
|
|
@@ -119,7 +120,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
|
|
|
119
120
|
model: MTEBModels,
|
|
120
121
|
data_split: Dataset,
|
|
121
122
|
*,
|
|
122
|
-
encode_kwargs:
|
|
123
|
+
encode_kwargs: EncodeKwargs,
|
|
123
124
|
hf_split: str,
|
|
124
125
|
hf_subset: str,
|
|
125
126
|
prediction_folder: Path | None = None,
|
|
@@ -17,7 +17,7 @@ from mteb._create_dataloaders import create_dataloader
|
|
|
17
17
|
from mteb._evaluators.classification_metrics import hamming_score
|
|
18
18
|
from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
|
|
19
19
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
20
|
-
from mteb.types import Array
|
|
20
|
+
from mteb.types import Array, EncodeKwargs
|
|
21
21
|
|
|
22
22
|
from .classification import AbsTaskClassification
|
|
23
23
|
|
|
@@ -83,7 +83,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
83
83
|
model: MTEBModels,
|
|
84
84
|
data_split: DatasetDict,
|
|
85
85
|
*,
|
|
86
|
-
encode_kwargs:
|
|
86
|
+
encode_kwargs: EncodeKwargs,
|
|
87
87
|
hf_split: str,
|
|
88
88
|
hf_subset: str,
|
|
89
89
|
prediction_folder: Path | None = None,
|
|
@@ -19,7 +19,7 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
19
19
|
from mteb.abstasks.abstask import AbsTask
|
|
20
20
|
from mteb.models.model_meta import ScoringFunction
|
|
21
21
|
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
|
|
22
|
-
from mteb.types import PromptType
|
|
22
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
23
23
|
from mteb.types.statistics import (
|
|
24
24
|
ImageStatistics,
|
|
25
25
|
LabelStatistics,
|
|
@@ -84,7 +84,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
84
84
|
*,
|
|
85
85
|
hf_split: str,
|
|
86
86
|
hf_subset: str,
|
|
87
|
-
encode_kwargs:
|
|
87
|
+
encode_kwargs: EncodeKwargs,
|
|
88
88
|
prediction_folder: Path | None = None,
|
|
89
89
|
**kwargs,
|
|
90
90
|
) -> dict[str, float]:
|
mteb/abstasks/retrieval.py
CHANGED
|
@@ -25,6 +25,7 @@ from mteb.models import (
|
|
|
25
25
|
SearchProtocol,
|
|
26
26
|
)
|
|
27
27
|
from mteb.types import (
|
|
28
|
+
EncodeKwargs,
|
|
28
29
|
HFSubset,
|
|
29
30
|
QueryDatasetType,
|
|
30
31
|
RelevantDocumentsType,
|
|
@@ -184,17 +185,17 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
184
185
|
return queries, corpus
|
|
185
186
|
|
|
186
187
|
if self.metadata.is_multilingual:
|
|
187
|
-
for subset in self.queries:
|
|
188
|
-
for split in self.queries[subset]:
|
|
189
|
-
queries = self.queries[subset][split]
|
|
190
|
-
corpus = self.corpus[subset][split]
|
|
188
|
+
for subset in self.queries: # type: ignore[attr-defined]
|
|
189
|
+
for split in self.queries[subset]: # type: ignore[attr-defined]
|
|
190
|
+
queries = self.queries[subset][split] # type: ignore[attr-defined]
|
|
191
|
+
corpus = self.corpus[subset][split] # type: ignore[attr-defined]
|
|
191
192
|
|
|
192
193
|
(
|
|
193
194
|
self.dataset[subset][split]["queries"],
|
|
194
195
|
self.dataset[subset][split]["corpus"],
|
|
195
196
|
) = _process_split(queries, corpus)
|
|
196
197
|
|
|
197
|
-
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
|
|
198
|
+
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
|
|
198
199
|
subset
|
|
199
200
|
][split]
|
|
200
201
|
if hasattr(self, "instructions"):
|
|
@@ -211,15 +212,15 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
211
212
|
][split]
|
|
212
213
|
else:
|
|
213
214
|
subset = "default"
|
|
214
|
-
for split in self.queries:
|
|
215
|
-
queries = self.queries[split]
|
|
216
|
-
corpus = self.corpus[split]
|
|
215
|
+
for split in self.queries: # type: ignore[attr-defined]
|
|
216
|
+
queries = self.queries[split] # type: ignore[attr-defined]
|
|
217
|
+
corpus = self.corpus[split] # type: ignore[attr-defined]
|
|
217
218
|
(
|
|
218
219
|
self.dataset[subset][split]["queries"],
|
|
219
220
|
self.dataset[subset][split]["corpus"],
|
|
220
221
|
) = _process_split(queries, corpus)
|
|
221
222
|
|
|
222
|
-
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
|
|
223
|
+
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
|
|
223
224
|
split
|
|
224
225
|
].copy()
|
|
225
226
|
if hasattr(self, "instructions"):
|
|
@@ -235,9 +236,9 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
235
236
|
split
|
|
236
237
|
].copy()
|
|
237
238
|
|
|
238
|
-
del self.queries
|
|
239
|
-
del self.corpus
|
|
240
|
-
del self.relevant_docs
|
|
239
|
+
del self.queries # type: ignore[attr-defined]
|
|
240
|
+
del self.corpus # type: ignore[attr-defined]
|
|
241
|
+
del self.relevant_docs # type: ignore[attr-defined]
|
|
241
242
|
if hasattr(self, "instructions"):
|
|
242
243
|
del self.instructions
|
|
243
244
|
if hasattr(self, "top_ranked"):
|
|
@@ -283,7 +284,7 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
283
284
|
split: str = "test",
|
|
284
285
|
subsets_to_run: list[HFSubset] | None = None,
|
|
285
286
|
*,
|
|
286
|
-
encode_kwargs:
|
|
287
|
+
encode_kwargs: EncodeKwargs,
|
|
287
288
|
prediction_folder: Path | None = None,
|
|
288
289
|
**kwargs: Any,
|
|
289
290
|
) -> Mapping[HFSubset, ScoresDict]:
|
|
@@ -320,7 +321,7 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
320
321
|
self,
|
|
321
322
|
model: MTEBModels,
|
|
322
323
|
data_split: RetrievalSplitData,
|
|
323
|
-
encode_kwargs:
|
|
324
|
+
encode_kwargs: EncodeKwargs,
|
|
324
325
|
hf_split: str,
|
|
325
326
|
hf_subset: str,
|
|
326
327
|
prediction_folder: Path | None = None,
|
mteb/abstasks/sts.py
CHANGED
|
@@ -8,7 +8,7 @@ from scipy.stats import pearsonr, spearmanr
|
|
|
8
8
|
from mteb._evaluators import AnySTSEvaluator
|
|
9
9
|
from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
|
|
10
10
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
11
|
-
from mteb.types import PromptType
|
|
11
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
12
12
|
from mteb.types.statistics import (
|
|
13
13
|
ImageStatistics,
|
|
14
14
|
ScoreStatistics,
|
|
@@ -105,7 +105,7 @@ class AbsTaskSTS(AbsTask):
|
|
|
105
105
|
self,
|
|
106
106
|
model: MTEBModels,
|
|
107
107
|
data_split: Dataset,
|
|
108
|
-
encode_kwargs:
|
|
108
|
+
encode_kwargs: EncodeKwargs,
|
|
109
109
|
hf_split: str,
|
|
110
110
|
hf_subset: str,
|
|
111
111
|
prediction_folder: Path | None = None,
|
|
@@ -10,7 +10,7 @@ from mteb._evaluators import BitextMiningEvaluator
|
|
|
10
10
|
from mteb.abstasks._statistics_calculation import calculate_text_statistics
|
|
11
11
|
from mteb.abstasks.abstask import AbsTask
|
|
12
12
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
13
|
-
from mteb.types import HFSubset, ScoresDict
|
|
13
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
14
14
|
from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
@@ -73,7 +73,7 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
73
73
|
split: str = "test",
|
|
74
74
|
subsets_to_run: list[HFSubset] | None = None,
|
|
75
75
|
*,
|
|
76
|
-
encode_kwargs:
|
|
76
|
+
encode_kwargs: EncodeKwargs,
|
|
77
77
|
prediction_folder: Path | None = None,
|
|
78
78
|
**kwargs: Any,
|
|
79
79
|
) -> dict[HFSubset, ScoresDict]:
|
|
@@ -142,7 +142,7 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
142
142
|
*,
|
|
143
143
|
hf_split: str,
|
|
144
144
|
hf_subset: str,
|
|
145
|
-
encode_kwargs:
|
|
145
|
+
encode_kwargs: EncodeKwargs,
|
|
146
146
|
prediction_folder: Path | None = None,
|
|
147
147
|
parallel: bool = False,
|
|
148
148
|
**kwargs,
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
import numpy as np
|
|
6
5
|
from datasets import Dataset
|
|
@@ -13,6 +12,7 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
13
12
|
)
|
|
14
13
|
from mteb.abstasks.abstask import AbsTask
|
|
15
14
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
15
|
+
from mteb.types import EncodeKwargs
|
|
16
16
|
from mteb.types.statistics import (
|
|
17
17
|
ScoreStatistics,
|
|
18
18
|
SplitDescriptiveStatistics,
|
|
@@ -82,7 +82,7 @@ class AbsTaskSummarization(AbsTask):
|
|
|
82
82
|
*,
|
|
83
83
|
hf_split: str,
|
|
84
84
|
hf_subset: str,
|
|
85
|
-
encode_kwargs:
|
|
85
|
+
encode_kwargs: EncodeKwargs,
|
|
86
86
|
prediction_folder: Path | None = None,
|
|
87
87
|
**kwargs,
|
|
88
88
|
) -> SummarizationMetrics:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import TypedDict
|
|
4
4
|
|
|
5
5
|
import torch
|
|
6
6
|
from datasets import Dataset
|
|
@@ -8,6 +8,7 @@ from sklearn import metrics
|
|
|
8
8
|
|
|
9
9
|
from mteb._evaluators import ZeroShotClassificationEvaluator
|
|
10
10
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
11
|
+
from mteb.types import EncodeKwargs
|
|
11
12
|
from mteb.types.statistics import (
|
|
12
13
|
ImageStatistics,
|
|
13
14
|
LabelStatistics,
|
|
@@ -116,7 +117,7 @@ class AbsTaskZeroShotClassification(AbsTask):
|
|
|
116
117
|
*,
|
|
117
118
|
hf_split: str,
|
|
118
119
|
hf_subset: str,
|
|
119
|
-
encode_kwargs:
|
|
120
|
+
encode_kwargs: EncodeKwargs,
|
|
120
121
|
prediction_folder: Path | None = None,
|
|
121
122
|
**kwargs,
|
|
122
123
|
) -> ZeroShotClassificationMetrics:
|
|
@@ -14,6 +14,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
14
14
|
JINA_VDR,
|
|
15
15
|
JMTEB_LITE_V1,
|
|
16
16
|
JMTEB_V2,
|
|
17
|
+
KOVIDORE_V2,
|
|
17
18
|
LONG_EMBED,
|
|
18
19
|
MIEB_ENG,
|
|
19
20
|
MIEB_IMG,
|
|
@@ -79,6 +80,7 @@ __all__ = [
|
|
|
79
80
|
"JINA_VDR",
|
|
80
81
|
"JMTEB_LITE_V1",
|
|
81
82
|
"JMTEB_V2",
|
|
83
|
+
"KOVIDORE_V2",
|
|
82
84
|
"LONG_EMBED",
|
|
83
85
|
"MIEB_ENG",
|
|
84
86
|
"MIEB_IMG",
|