mteb 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +33 -27
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +7 -26
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +22 -19
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +2 -2
- mteb/cache.py +27 -22
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +15 -10
- mteb/cli/generate_model_card.py +10 -7
- mteb/deprecated_evaluator.py +60 -46
- mteb/evaluate.py +39 -30
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +7 -5
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/gme_v_models.py +4 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +14 -13
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +26 -12
- mteb/models/sentence_transformer_wrapper.py +19 -14
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +28 -20
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +55 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/METADATA +1 -1
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/RECORD +104 -103
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.5.4.dist-info}/top_level.txt +0 -0
mteb/models/search_wrappers.py
CHANGED

@@ -200,7 +200,7 @@ class SearchEncoderWrapper:
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None

-        results = {qid: {} for qid in query_idx_to_id.values()}
+        results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()}
         for qid in result_heaps:
             for score, corpus_id in result_heaps[qid]:
                 results[qid][corpus_id] = score
@@ -218,13 +218,19 @@ class SearchEncoderWrapper:
         encode_kwargs: dict[str, Any],
     ) -> dict[str, list[tuple[float, str]]]:
         logger.info("Encoding Corpus in batches (this might take a while)...")
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+
         itr = range(0, len(self.task_corpus), self.corpus_chunk_size)

-        result_heaps …
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         for batch_num, corpus_start_idx in enumerate(itr):
             logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...")
             corpus_end_idx = min(
-                corpus_start_idx + self.corpus_chunk_size, …
+                corpus_start_idx + self.corpus_chunk_size,
+                len(self.task_corpus),
             )
             sub_corpus = self.task_corpus.select(
                 range(corpus_start_idx, corpus_end_idx)
@@ -249,7 +255,7 @@ class SearchEncoderWrapper:
             scores = self.model.similarity(query_embeddings, sub_corpus_embeddings)

             # get top-k values
-            …
+            cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk(
                 torch.as_tensor(scores),
                 min(
                     top_k + 1,
@@ -258,8 +264,8 @@
                 dim=1,
                 largest=True,
             )
-            cos_scores_top_k_idx = …
-            cos_scores_top_k_values = …
+            cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
+            cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()

             sub_corpus_ids = list(sub_corpus_ids)
             result_heaps = self._sort_full_corpus_results(
@@ -319,7 +325,11 @@
         Returns:
             A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
         """
-        …
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}

         all_doc_embeddings = self.model.encode(
@@ -340,7 +350,8 @@
         for query_idx, query_embedding in enumerate(query_embeddings):
             query_id = query_idx_to_id[query_idx]
             if query_id not in top_ranked:
-                …
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue

             ranked_ids = top_ranked[query_id]
@@ -386,12 +397,12 @@

     def _rerank_sort_results(
         self,
-        result_heaps: list[tuple[float, str]],
+        result_heaps: dict[str, list[tuple[float, str]]],
         query_id: str,
         ranked_ids: list[str],
         scores_top_k_idx: torch.Tensor,
         scores_top_k_values: torch.Tensor,
-    ) -> list[tuple[float, str]]:
+    ) -> dict[str, list[tuple[float, str]]]:
         """Sort the heap into descending order list.

         Returns:
@@ -502,6 +513,8 @@ class SearchCrossEncoderWrapper:
             raise ValueError(
                 "CrossEncoder search requires top_ranked documents for reranking."
             )
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")

         query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)}
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}
@@ -511,7 +524,8 @@
         doc_pairs_ids: list[tuple[str, str]] = []
         for query_id, corpus_ids in top_ranked.items():
             if query_id not in top_ranked:
-                …
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue

             query_idx = query_id_to_idx[query_id]
@@ -540,7 +554,7 @@
             hf_subset=hf_subset,
         )

-        results = {qid: {} for qid in queries["id"]}
+        results: RetrievalOutputType = {qid: {} for qid in queries["id"]}
         for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions):
             results[query_id][corpus_id] = float(score)
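The search_wrappers.py hunks above add None guards on self.task_corpus, annotate the per-query result heaps, and materialize the torch.topk results before moving them to the CPU. As a point of reference, a minimal self-contained sketch of the chunked top-k pattern they operate on could look like the following; the function name, the normalized dot-product similarity, and the bounded heaps are illustrative assumptions, not mteb's actual API.

# Minimal sketch (not mteb's code): score queries against one corpus chunk and
# keep at most top_k (score, doc_id) pairs per query, as the wrapper does per batch.
import heapq

import torch


def top_k_per_query(
    query_embeddings: torch.Tensor,   # shape (n_queries, dim)
    corpus_embeddings: torch.Tensor,  # shape (n_docs, dim)
    corpus_ids: list[str],
    top_k: int,
) -> dict[int, list[tuple[float, str]]]:
    # assumption: cosine-style similarity via normalized dot product
    queries = torch.nn.functional.normalize(query_embeddings, dim=-1)
    docs = torch.nn.functional.normalize(corpus_embeddings, dim=-1)
    scores = queries @ docs.T
    k = min(top_k, scores.shape[1])
    values, indices = torch.topk(scores, k=k, dim=1, largest=True)

    heaps: dict[int, list[tuple[float, str]]] = {q: [] for q in range(len(query_embeddings))}
    for q, (vals, idxs) in enumerate(zip(values.tolist(), indices.tolist())):
        for score, idx in zip(vals, idxs):
            # heapq keeps the smallest score at heaps[q][0], so memory stays
            # bounded at top_k entries per query even across many corpus chunks
            if len(heaps[q]) < top_k:
                heapq.heappush(heaps[q], (score, corpus_ids[idx]))
            else:
                heapq.heappushpop(heaps[q], (score, corpus_ids[idx]))
    return heaps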
mteb/models/sentence_transformer_wrapper.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import warnings
 from typing import TYPE_CHECKING, Any

 import numpy as np
@@ -75,9 +76,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         if built_in_prompts and not model_prompts:
             model_prompts = built_in_prompts
         elif model_prompts and built_in_prompts:
-            …
-            …
-            )
+            msg = f"Model prompts specified, these will overwrite the default model prompts. Current prompts will be:\n {model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)
             self.model.prompts = model_prompts

         self.model_prompts, invalid_prompts = self.validate_task_to_prompt_name(
@@ -86,9 +87,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):

         if invalid_prompts:
             invalid_prompts = "\n".join(invalid_prompts)
-            …
-            …
-            )
+            msg = f"Some prompts are not in the expected format and will be ignored. Problems:\n\n{invalid_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)

         if (
             self.model_prompts
@@ -98,13 +99,15 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
                 or PromptType.document.value not in self.model_prompts
             )
         ):
-            …
-            …
-            …
-            )
+            msg = f"SentenceTransformers that use prompts most often need to be configured with at least 'query' and 'document' prompts to ensure optimal performance. Received {self.model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)

+    def similarity(self, embeddings1: Array, embeddings2: Array) -> Array:
+        """Compute the similarity between two collections of embeddings."""
         if hasattr(self.model, "similarity") and callable(self.model.similarity):
-            …
+            return self.model.similarity(embeddings1, embeddings2)
+        return super().similarity(embeddings1, embeddings2)

     def encode(
         self,
@@ -150,7 +153,7 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
         else:
@@ -221,7 +224,7 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             logger.info(
                 f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
@@ -234,7 +237,9 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         all_embeddings = []
         for batch in inputs:
             batch_column = next(iter(batch.keys()))
-            batched_input …
+            batched_input: list[dict[str, Any]] = [
+                dict() for _ in range(len(batch[batch_column]))
+            ]

             # transform from {"text": [text1, text2], "image": [image1, image2]} to
             # [{"text": text1, "image": image1}, {"text": text2, "image": image2}]
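Two patterns recur in the sentence_transformer_wrapper.py changes: diagnostic messages are now built once and emitted through both logger.warning and warnings.warn, and a similarity method defers to the wrapped SentenceTransformer when it provides one. A hedged sketch of both follows, with an invented wrapper class and prompt check rather than mteb's real implementation.

# Illustrative sketch only; class and prompt check are assumptions, not mteb's code.
from __future__ import annotations

import logging
import warnings

import numpy as np

logger = logging.getLogger(__name__)


class EncoderWrapperSketch:
    def __init__(self, model, model_prompts: dict[str, str] | None = None) -> None:
        self.model = model
        if model_prompts and "query" not in model_prompts:
            msg = f"No 'query' prompt configured; received {model_prompts}"
            logger.warning(msg)  # lands in log files
            warnings.warn(msg)   # also surfaces to interactive users
        self.model_prompts = model_prompts

    def similarity(self, embeddings1: np.ndarray, embeddings2: np.ndarray) -> np.ndarray:
        # Prefer the wrapped model's own similarity (e.g. its configured score
        # function); otherwise fall back to a plain cosine similarity.
        if hasattr(self.model, "similarity") and callable(self.model.similarity):
            return np.asarray(self.model.similarity(embeddings1, embeddings2))
        a = embeddings1 / np.linalg.norm(embeddings1, axis=-1, keepdims=True)
        b = embeddings2 / np.linalg.norm(embeddings2, axis=-1, keepdims=True)
        return a @ b.T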
mteb/py.typed
ADDED

File without changes
mteb/results/benchmark_results.py
CHANGED

@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Iterator
+from collections.abc import Callable, Iterable, Iterator
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 import pandas as pd
 from packaging.version import InvalidVersion, Version
@@ -33,11 +35,12 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


-# Global cache for model metas and version parsing
 @functools.lru_cache
 def _get_cached_model_metas() -> dict[str, str | None]:
     """Cache model metas to avoid repeated calls."""
-    return { …
+    return {
+        meta.name: meta.revision for meta in get_model_metas() if meta.name is not None
+    }


 @functools.lru_cache(maxsize=10000)
@@ -77,10 +80,10 @@ class BenchmarkResults(BaseModel):
         task_names: list[str] | None = None,
         languages: list[str] | None = None,
         domains: list[TaskDomain] | None = None,
-        task_types: list[TaskType] | None = None,
+        task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) -> …
+    ) -> BenchmarkResults:
         # TODO: Same as filter_models
         model_results = [
             res._filter_tasks(
@@ -97,7 +100,7 @@ class BenchmarkResults(BaseModel):
             model_results=[res for res in model_results if res.task_results]
         )

-    def select_tasks(self, tasks: …
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
         """Select tasks from the benchmark results.

         Args:
@@ -115,7 +118,7 @@ class BenchmarkResults(BaseModel):
         self,
         names: list[str] | list[ModelMeta],
         revisions: list[str | None] | None = None,
-    ) -> …
+    ) -> BenchmarkResults:
         """Get models by name and revision.

         Args:
@@ -128,7 +131,7 @@ class BenchmarkResults(BaseModel):
         models_res = []
         _revisions = revisions if revisions is not None else [None] * len(names)

-        name_rev = {}
+        name_rev: dict[str, str | None] = {}

         if len(names) != len(_revisions):
             raise ValueError(
@@ -137,9 +140,12 @@ class BenchmarkResults(BaseModel):

         for name, revision in zip(names, _revisions):
             if isinstance(name, ModelMeta):
+                if name.name is None:
+                    raise ValueError("name in ModelMeta is None. It must be a string.")
                 name_rev[name.name] = name.revision
             else:
-                …
+                name_ = cast(str, name)
+                name_rev[name_] = revision

         for model_res in self.model_results:
             model_name = model_res.model_name
@@ -159,7 +165,7 @@ class BenchmarkResults(BaseModel):
         n_parameters_range: tuple[int | None, int | None] = (None, None),
         use_instructions: bool | None = None,
         zero_shot_on: list[AbsTask] | None = None,
-    ) -> …
+    ) -> BenchmarkResults:
         # mostly a utility function for the leaderboard app.
         # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
         # interface would then be the same as the get_models function
@@ -182,7 +188,7 @@ class BenchmarkResults(BaseModel):

         return type(self).model_construct(model_results=new_model_results)

-    def join_revisions(self) -> …
+    def join_revisions(self) -> BenchmarkResults:
         """Join revisions of the same model.

         In case of conflicts, the following rules are applied:
@@ -212,10 +218,10 @@ class BenchmarkResults(BaseModel):

         # Use cached model metas
         model_to_main_revision = _get_cached_model_metas()
-        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
+        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

         # Use cached version parsing
-        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

         # Filter out rows without scores first
         task_df = task_df[task_df["has_scores"]]
@@ -259,8 +265,8 @@ class BenchmarkResults(BaseModel):
         # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
-                model_name=model,
-                model_revision=model_revision,
+                model_name=model,  # type: ignore[arg-type]
+                model_revision=model_revision,  # type: ignore[arg-type]
                 task_results=list(group["task_result"]),
             )
             model_results.append(model_result)
@@ -291,7 +297,7 @@ class BenchmarkResults(BaseModel):
                     {
                         "model": model_res.model_name,
                         "revision": model_res.model_revision,
-                        **model_scores,
+                        **model_scores,
                     }
                 )
             except Exception as e:
@@ -364,7 +370,9 @@ class BenchmarkResults(BaseModel):
             scores_data.extend(model_result._get_score_for_table())

         if not scores_data:
-            …
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()

         # Create DataFrame
@@ -402,7 +410,7 @@ class BenchmarkResults(BaseModel):

         return self.benchmark._create_summary_table(self)

-    def __iter__(self) -> Iterator[ModelResult]:
+    def __iter__(self) -> Iterator[ModelResult]:  # type: ignore[override]
         return iter(self.model_results)

     def __getitem__(self, index: int) -> ModelResult:
@@ -424,7 +432,7 @@ class BenchmarkResults(BaseModel):
             out_file.write(self.model_dump_json(indent=2))

     @classmethod
-    def from_validated(cls, **data) -> …
+    def from_validated(cls, **data) -> BenchmarkResults:
         """Create BenchmarkResults from validated data.

         Args:
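Most hunks in benchmark_results.py only fill in return annotations such as -> BenchmarkResults, which becomes possible once from __future__ import annotations postpones annotation evaluation, and the module keeps a functools.lru_cache-backed lookup of model metadata. A small sketch of both ideas under assumed names (ResultsSketch, cached_lookup), not the library's own classes:

# Sketch of the typing/caching patterns adopted above; names are illustrative.
from __future__ import annotations

import functools


class ResultsSketch:
    def __init__(self, items: list[str]) -> None:
        self.items = items

    def filter_items(self, keep: set[str]) -> ResultsSketch:
        # The bare class name works as a return annotation because annotations
        # are no longer evaluated at definition time; calls can be chained.
        return ResultsSketch([it for it in self.items if it in keep])


@functools.lru_cache
def cached_lookup() -> dict[str, str | None]:
    # Stands in for _get_cached_model_metas(): compute an expensive mapping once
    # and reuse it on every later call.
    return {"model-a": "rev1", "model-b": None}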
mteb/results/model_result.py
CHANGED

@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Callable, Iterable
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast

 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import …
+from typing_extensions import overload

 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -58,7 +60,7 @@ def _aggregate_and_pivot(
             index=index_columns,
             columns=columns,
             values="score",
-            aggfunc=aggregation_fn,
+            aggfunc=aggregation_fn,  # type: ignore[arg-type]
         ).reset_index()
     elif format == "long":
         return (
@@ -81,7 +83,7 @@ class ModelResult(BaseModel):
     model_revision: str | None
     task_results: list[TaskResult]
     default_modalities: list[Modalities] = Field(
-        default_factory=lambda: ["text"], alias="modalities"
+        default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
     )
     model_config = (
         ConfigDict(  # to free up the name model_* which is otherwise protected
@@ -95,16 +97,17 @@ class ModelResult(BaseModel):
         return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"

     @classmethod
-    def from_validated(cls, **data: dict[str, Any]) -> …
+    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
         """Create a ModelResult from validated data.

         Args:
             data: The validated data.
         """
-        data["task_results"] = [
-            TaskResult.from_validated(**res)
+        data["task_results"] = [  # type: ignore[assignment]
+            TaskResult.from_validated(**res)  # type: ignore[arg-type]
+            for res in data["task_results"]
         ]
-        return cls.model_construct(**data)
+        return cls.model_construct(**data)  # type: ignore[arg-type]

     def _filter_tasks(
         self,
@@ -114,7 +117,7 @@ class ModelResult(BaseModel):
         task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) -> …
+    ) -> ModelResult:
         new_task_results = []
         for task_result in self.task_results:
             if (task_names is not None) and (task_result.task_name not in task_names):
@@ -142,7 +145,7 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

-    def select_tasks(self, tasks: …
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
         """Select tasks from the ModelResult based on a list of AbsTask objects.

         Args:
@@ -160,6 +163,28 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["wide"] = "wide",
+    ) -> dict: ...
+
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["long"] = "long",
+    ) -> list: ...
+
     def _get_scores(
         self,
         splits: list[SplitName] | None = None,
@@ -177,21 +202,24 @@ class ModelResult(BaseModel):
             aggregation = aggregation if aggregation is not None else np.mean
         else:
             use_fast = True
+            aggregation = cast(Callable[[list[Score]], Any], aggregation)
+            getter = cast(Callable[[ScoresDict], Score], getter)
+
         if format == "wide":
             scores = {}
             for res in self.task_results:
                 try:
                     if use_fast:
                         scores[res.task_name] = res._get_score_fast(
-                            splits=splits,
-                            languages=languages,
+                            splits=splits,
+                            languages=languages,
                         )
                     else:
                         scores[res.task_name] = res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,
-                            getter=getter,
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                 except Exception as e:
@@ -206,14 +234,14 @@ class ModelResult(BaseModel):
                 if use_fast:
                     score = task_res._get_score_fast(
                         splits=splits,
-                        languages=languages,
+                        languages=languages,
                     )
                 else:
                     score = task_res.get_score(
                         splits=splits,
                         languages=languages,
-                        aggregation=aggregation,
-                        getter=getter,
+                        aggregation=aggregation,
+                        getter=getter,
                         scripts=scripts,
                     )
                 entry = dict(
@@ -292,7 +320,9 @@ class ModelResult(BaseModel):
         scores_data = self._get_score_for_table()

         if not scores_data:
-            …
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()

         # Create DataFrame
@@ -315,7 +345,7 @@ class ModelResult(BaseModel):
     def __hash__(self) -> int:
         return id(self)

-    def __iter__(self) -> Iterable[TaskResult]:
+    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
         return iter(self.task_results)

     def __getitem__(self, index) -> TaskResult:
@@ -368,13 +398,13 @@ class ModelResult(BaseModel):
         return [task_res.task_name for task_res in self.task_results]

     @property
-    def modalities(self) -> list[ …
+    def modalities(self) -> list[Modalities]:
         """Get all modalities in the task results.

         Returns:
             A list of modalities in the task results.
         """
-        mods = []
+        mods: list[Modalities] = []
         for task_res in self.task_results:
             task_modalities = getattr(task_res, "modalities", [])
             mods.extend(task_modalities)
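The @overload stubs added to ModelResult._get_scores let type checkers tie the return type to the format literal: "wide" yields a dict and "long" a list, while a single runtime implementation serves both. A standalone sketch of the same pattern with invented names and placeholder scores:

# Sketch of the @overload pattern; get_scores and the sample data are assumptions.
from __future__ import annotations

from typing import Literal, overload


@overload
def get_scores(format: Literal["wide"] = "wide") -> dict[str, float]: ...
@overload
def get_scores(format: Literal["long"]) -> list[tuple[str, float]]: ...
def get_scores(format: str = "wide") -> dict[str, float] | list[tuple[str, float]]:
    # Single runtime body; the two stubs above exist only for type checkers.
    data = {"TaskA": 0.81, "TaskB": 0.67}  # placeholder scores
    if format == "wide":
        return data
    return list(data.items())


wide_scores = get_scores()        # checked as dict[str, float]
long_scores = get_scores("long")  # checked as list[tuple[str, float]]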