mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED

```diff
@@ -22,6 +22,7 @@ from huggingface_hub import (
 from huggingface_hub.errors import (
     EntryNotFoundError,
     GatedRepoError,
+    HFValidationError,
     NotASafetensorsRepoError,
     RepositoryNotFoundError,
     SafetensorsParsingError,
@@ -81,7 +82,7 @@ def _get_loader_name(
     return loader.__name__


-_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
+_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"


 class ModelMeta(BaseModel):
@@ -263,10 +264,8 @@ class ModelMeta(BaseModel):
         _kwargs = self.loader_kwargs.copy()
         _kwargs.update(kwargs)

-        model:
-
-        )
-        model.mteb_model_meta = self  # type: ignore
+        model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
+        model.mteb_model_meta = self  # type: ignore[misc]
         return model

     def model_name_as_path(self) -> str:
@@ -307,7 +306,7 @@ class ModelMeta(BaseModel):
         embedding_dim = None
         max_tokens = None

-        if model_name and compute_metadata and
+        if model_name and compute_metadata and _repo_exists(model_name):
             reference = "https://huggingface.co/" + model_name
             card = ModelCard.load(model_name)
             card_data: ModelCardData = card.data
@@ -318,9 +317,8 @@ class ModelMeta(BaseModel):
                 model_config = None
                 logger.warning(f"Can't get configuration for {model_name}. Error: {e}")

-            if (
-                card_data.
-                or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
+            if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
+                card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
             ):
                 frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
             else:
@@ -417,7 +415,7 @@ class ModelMeta(BaseModel):
             meta.framework.append("Sentence Transformers")
             meta.modalities = ["text"]

-        if model and compute_metadata and
+        if model and compute_metadata and _repo_exists(model):
             # have max_seq_length field
             sbert_config = _get_json_from_hub(
                 model, "sentence_bert_config.json", "model", revision=revision
@@ -435,7 +433,7 @@ class ModelMeta(BaseModel):
                 and config_sbert.get("similarity_fn_name") is not None
             ):
                 meta.similarity_fn_name = ScoringFunction.from_str(
-                    config_sbert
+                    config_sbert["similarity_fn_name"]
                 )
             else:
                 meta.similarity_fn_name = ScoringFunction.COSINE
@@ -516,7 +514,7 @@ class ModelMeta(BaseModel):
            warnings.warn(msg)

         return_dataset = training_datasets.copy()
-        visited = set()
+        visited: set[str] = set()

         for dataset in training_datasets:
             similar_tasks = _collect_similar_tasks(dataset, visited)
@@ -550,6 +548,8 @@ class ModelMeta(BaseModel):

     @staticmethod
     def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
+        if not model_name:
+            return None
         try:
             safetensors_metadata = get_safetensors_metadata(model_name)
             if len(safetensors_metadata.parameter_count) >= 0:
@@ -563,7 +563,7 @@ class ModelMeta(BaseModel):
            logger.warning(
                f"Can't calculate number of parameters for {model_name}. Got error {e}"
            )
-
+            return None

     def calculate_num_parameters_from_hub(self) -> int | None:
         """Calculates the number of parameters in the model.
@@ -626,7 +626,7 @@ class ModelMeta(BaseModel):
         if "API" in self.framework or self.name is None:
             return None

-        return self._calculate_memory_usage_mb(self.
+        return self._calculate_memory_usage_mb(self.name, self.n_parameters)

     @staticmethod
     def fetch_release_date(model_name: str) -> StrDate | None:
@@ -786,3 +786,19 @@ def _get_file_on_hub(
     except (GatedRepoError, RepositoryNotFoundError, EntryNotFoundError) as e:
         logger.warning(f"Can't get file {file_name} of {repo_id}: {e}")
         return None
+
+
+def _repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
+    """Checks if a repository exists on HuggingFace Hub.
+
+    Repo exists will raise HFValidationError for invalid local paths
+
+    Args:
+        repo_id: The repository ID.
+        repo_type: The type of repository (e.g., "model", "dataset", "space").
+    """
+    try:
+        return repo_exists(repo_id=repo_id, repo_type=repo_type)
+    except HFValidationError as e:
+        logger.warning(f"Can't check existence of {repo_id}: {e}")
+        return False
```
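For context, the new `_repo_exists` helper guards the Hub metadata lookups above: `huggingface_hub.repo_exists` raises `HFValidationError` when given a string that is not a valid repo ID (for example a local path), so the check is wrapped and downgraded to `False`. Below is a minimal, self-contained sketch of the same pattern outside mteb; the `safe_repo_exists` name and the example model ID are illustrative only.

```python
from __future__ import annotations

import logging

from huggingface_hub import repo_exists
from huggingface_hub.errors import HFValidationError

logger = logging.getLogger(__name__)


def safe_repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
    """Return True only if the repo exists; never raise on malformed repo IDs."""
    try:
        return repo_exists(repo_id=repo_id, repo_type=repo_type)
    except HFValidationError as e:  # e.g. a local filesystem path was passed in
        logger.warning(f"Can't check existence of {repo_id}: {e}")
        return False


# Hypothetical usage mirroring the changed call sites: only hit the Hub for
# model-card/config metadata when the repository actually exists.
if safe_repo_exists("sentence-transformers/all-MiniLM-L6-v2"):
    ...  # load ModelCard / config from the Hub
```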
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py
CHANGED

```diff
@@ -109,7 +109,7 @@ class FaissSearchIndex:
             ids = ids.tolist()

         if issubclass(self.index_type, faiss.IndexFlatL2):
-            similarities = -np.sqrt(np.maximum(similarities, 0))
+            similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()

         return similarities, ids

@@ -117,8 +117,8 @@ class FaissSearchIndex:
         self,
         embeddings: Array,
         top_k: int,
-        top_ranked: TopRankedDocumentsType
-        query_idx_to_id: dict[int, str]
+        top_ranked: TopRankedDocumentsType,
+        query_idx_to_id: dict[int, str],
     ) -> tuple[list[list[float]], list[list[int]]]:
         doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
         scores_all: list[list[float]] = []
@@ -136,9 +136,9 @@ class FaissSearchIndex:
                 continue

             candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
-            d = self.index.d
+            d = self.index.d  # type: ignore[union-attr]
             candidate_embs = np.vstack(
-                [self.index.reconstruct(idx) for idx in candidate_indices]
+                [self.index.reconstruct(idx) for idx in candidate_indices]  # type: ignore[union-attr]
             )
             sub_reranking_index = self.index_type(d)
             sub_reranking_index.add(candidate_embs)
```
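The first hunk touches the distance-to-score conversion: `IndexFlatL2.search` returns squared L2 distances, which are clamped at zero, square-rooted, and negated so that larger values still mean "more similar", then turned into plain Python lists. A small standalone sketch of that conversion on toy random embeddings (assuming `faiss` and `numpy` are installed; not mteb code):

```python
import faiss
import numpy as np

xb = np.random.rand(8, 4).astype("float32")  # toy corpus embeddings
xq = np.random.rand(2, 4).astype("float32")  # toy query embeddings

index = faiss.IndexFlatL2(xb.shape[1])
index.add(xb)

# search returns squared L2 distances and corpus row indices
squared_distances, ids = index.search(xq, 3)

# clamp, sqrt, negate: smaller distance -> larger (less negative) score
similarities = (-np.sqrt(np.maximum(squared_distances, 0))).tolist()
ids = ids.tolist()
```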
mteb/models/search_wrappers.py
CHANGED

```diff
@@ -200,7 +200,7 @@ class SearchEncoderWrapper:
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None

-        results = {qid: {} for qid in query_idx_to_id.values()}
+        results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()}
         for qid in result_heaps:
             for score, corpus_id in result_heaps[qid]:
                 results[qid][corpus_id] = score
@@ -218,13 +218,19 @@ class SearchEncoderWrapper:
         encode_kwargs: dict[str, Any],
     ) -> dict[str, list[tuple[float, str]]]:
         logger.info("Encoding Corpus in batches (this might take a while)...")
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+
         itr = range(0, len(self.task_corpus), self.corpus_chunk_size)

-        result_heaps
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         for batch_num, corpus_start_idx in enumerate(itr):
             logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...")
             corpus_end_idx = min(
-                corpus_start_idx + self.corpus_chunk_size,
+                corpus_start_idx + self.corpus_chunk_size,
+                len(self.task_corpus),
             )
             sub_corpus = self.task_corpus.select(
                 range(corpus_start_idx, corpus_end_idx)
@@ -249,7 +255,7 @@ class SearchEncoderWrapper:
             scores = self.model.similarity(query_embeddings, sub_corpus_embeddings)

             # get top-k values
-
+            cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk(
                 torch.as_tensor(scores),
                 min(
                     top_k + 1,
@@ -258,8 +264,8 @@ class SearchEncoderWrapper:
                 dim=1,
                 largest=True,
             )
-            cos_scores_top_k_idx =
-            cos_scores_top_k_values =
+            cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
+            cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()

             sub_corpus_ids = list(sub_corpus_ids)
             result_heaps = self._sort_full_corpus_results(
@@ -319,7 +325,11 @@ class SearchEncoderWrapper:
         Returns:
             A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
         """
-
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}

         all_doc_embeddings = self.model.encode(
@@ -387,12 +397,12 @@ class SearchEncoderWrapper:

     def _rerank_sort_results(
         self,
-        result_heaps: list[tuple[float, str]],
+        result_heaps: dict[str, list[tuple[float, str]]],
         query_id: str,
         ranked_ids: list[str],
         scores_top_k_idx: torch.Tensor,
         scores_top_k_values: torch.Tensor,
-    ) -> list[tuple[float, str]]:
+    ) -> dict[str, list[tuple[float, str]]]:
         """Sort the heap into descending order list.

         Returns:
@@ -503,6 +513,8 @@ class SearchCrossEncoderWrapper:
             raise ValueError(
                 "CrossEncoder search requires top_ranked documents for reranking."
             )
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")

         query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)}
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}
@@ -542,7 +554,7 @@ class SearchCrossEncoderWrapper:
                 hf_subset=hf_subset,
             )

-        results = {qid: {} for qid in queries["id"]}
+        results: RetrievalOutputType = {qid: {} for qid in queries["id"]}
         for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions):
             results[query_id][corpus_id] = float(score)
```
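The restored top-k step above is plain `torch.topk` over each corpus chunk, with values and indices moved back to CPU lists before they are pushed onto the per-query result heaps. A minimal standalone sketch of that step with random scores and hypothetical shapes (not mteb code):

```python
import torch

# (n_queries, chunk_size) similarity scores for one corpus chunk
scores = torch.rand(2, 100)
top_k = 10

# keep the best candidates per query; top_k + 1 mirrors the hunk above
values, indices = torch.topk(
    torch.as_tensor(scores),
    min(top_k + 1, scores.shape[1]),
    dim=1,
    largest=True,
)
cos_scores_top_k_values = values.cpu().tolist()
cos_scores_top_k_idx = indices.cpu().tolist()
```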
mteb/models/sentence_transformer_wrapper.py
CHANGED

```diff
@@ -103,8 +103,11 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
             logger.warning(msg)
             warnings.warn(msg)

+    def similarity(self, embeddings1: Array, embeddings2: Array) -> Array:
+        """Compute the similarity between two collections of embeddings."""
         if hasattr(self.model, "similarity") and callable(self.model.similarity):
-
+            return self.model.similarity(embeddings1, embeddings2)
+        return super().similarity(embeddings1, embeddings2)

     def encode(
         self,
@@ -150,7 +153,7 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
         else:
@@ -221,7 +224,7 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             logger.info(
                 f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
@@ -234,7 +237,9 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         all_embeddings = []
         for batch in inputs:
             batch_column = next(iter(batch.keys()))
-            batched_input
+            batched_input: list[dict[str, Any]] = [
+                dict() for _ in range(len(batch[batch_column]))
+            ]

             # transform from {"text": [text1, text2], "image": [image1, image2]} to
             # [{"text": text1, "image": image1}, {"text": text2, "image": image2}]
```
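The new `similarity` method is a small delegation pattern: use the wrapped model's own `similarity` when it provides one, otherwise fall back to the base encoder's implementation. A simplified sketch of the idea; the class names and the cosine fallback are illustrative, not mteb's actual classes:

```python
import numpy as np


class BaseEncoder:
    def similarity(self, embeddings1, embeddings2):
        # fallback: plain cosine similarity between row vectors
        a = embeddings1 / np.linalg.norm(embeddings1, axis=1, keepdims=True)
        b = embeddings2 / np.linalg.norm(embeddings2, axis=1, keepdims=True)
        return a @ b.T


class WrappedEncoder(BaseEncoder):
    def __init__(self, model):
        self.model = model

    def similarity(self, embeddings1, embeddings2):
        # prefer the wrapped model's own similarity function when available
        if hasattr(self.model, "similarity") and callable(self.model.similarity):
            return self.model.similarity(embeddings1, embeddings2)
        return super().similarity(embeddings1, embeddings2)
```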
mteb/py.typed
ADDED
File without changes
mteb/results/benchmark_results.py
CHANGED

```diff
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Iterator
+from collections.abc import Callable, Iterable, Iterator
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 import pandas as pd
 from packaging.version import InvalidVersion, Version
@@ -33,11 +35,12 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


-# Global cache for model metas and version parsing
 @functools.lru_cache
 def _get_cached_model_metas() -> dict[str, str | None]:
     """Cache model metas to avoid repeated calls."""
-    return {
+    return {
+        meta.name: meta.revision for meta in get_model_metas() if meta.name is not None
+    }


 @functools.lru_cache(maxsize=10000)
@@ -77,10 +80,10 @@ class BenchmarkResults(BaseModel):
         task_names: list[str] | None = None,
         languages: list[str] | None = None,
         domains: list[TaskDomain] | None = None,
-        task_types: list[TaskType] | None = None,
+        task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         # TODO: Same as filter_models
         model_results = [
             res._filter_tasks(
@@ -97,7 +100,7 @@ class BenchmarkResults(BaseModel):
             model_results=[res for res in model_results if res.task_results]
         )

-    def select_tasks(self, tasks:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
         """Select tasks from the benchmark results.

         Args:
@@ -115,7 +118,7 @@ class BenchmarkResults(BaseModel):
         self,
         names: list[str] | list[ModelMeta],
         revisions: list[str | None] | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         """Get models by name and revision.

         Args:
@@ -128,7 +131,7 @@ class BenchmarkResults(BaseModel):
         models_res = []
         _revisions = revisions if revisions is not None else [None] * len(names)

-        name_rev = {}
+        name_rev: dict[str, str | None] = {}

         if len(names) != len(_revisions):
             raise ValueError(
@@ -137,9 +140,12 @@ class BenchmarkResults(BaseModel):

         for name, revision in zip(names, _revisions):
             if isinstance(name, ModelMeta):
+                if name.name is None:
+                    raise ValueError("name in ModelMeta is None. It must be a string.")
                 name_rev[name.name] = name.revision
             else:
-
+                name_ = cast(str, name)
+                name_rev[name_] = revision

         for model_res in self.model_results:
             model_name = model_res.model_name
@@ -159,7 +165,7 @@ class BenchmarkResults(BaseModel):
         n_parameters_range: tuple[int | None, int | None] = (None, None),
         use_instructions: bool | None = None,
         zero_shot_on: list[AbsTask] | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         # mostly a utility function for the leaderboard app.
         # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
         # interface would then be the same as the get_models function
@@ -182,7 +188,7 @@ class BenchmarkResults(BaseModel):

         return type(self).model_construct(model_results=new_model_results)

-    def join_revisions(self) ->
+    def join_revisions(self) -> BenchmarkResults:
         """Join revisions of the same model.

         In case of conflicts, the following rules are applied:
@@ -212,10 +218,10 @@ class BenchmarkResults(BaseModel):

         # Use cached model metas
         model_to_main_revision = _get_cached_model_metas()
-        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
+        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

         # Use cached version parsing
-        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

         # Filter out rows without scores first
         task_df = task_df[task_df["has_scores"]]
@@ -259,8 +265,8 @@ class BenchmarkResults(BaseModel):
         # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
-                model_name=model,
-                model_revision=model_revision,
+                model_name=model,  # type: ignore[arg-type]
+                model_revision=model_revision,  # type: ignore[arg-type]
                 task_results=list(group["task_result"]),
             )
             model_results.append(model_result)
@@ -291,7 +297,7 @@ class BenchmarkResults(BaseModel):
                     {
                         "model": model_res.model_name,
                         "revision": model_res.model_revision,
-                        **model_scores,
+                        **model_scores,
                     }
                 )
             except Exception as e:
@@ -404,7 +410,7 @@ class BenchmarkResults(BaseModel):

         return self.benchmark._create_summary_table(self)

-    def __iter__(self) -> Iterator[ModelResult]:
+    def __iter__(self) -> Iterator[ModelResult]:  # type: ignore[override]
         return iter(self.model_results)

     def __getitem__(self, index: int) -> ModelResult:
@@ -426,7 +432,7 @@ class BenchmarkResults(BaseModel):
             out_file.write(self.model_dump_json(indent=2))

     @classmethod
-    def from_validated(cls, **data) ->
+    def from_validated(cls, **data) -> BenchmarkResults:
         """Create BenchmarkResults from validated data.

         Args:
```
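Several of these hunks complete return annotations such as `-> BenchmarkResults` on methods of `BenchmarkResults` itself; the `from __future__ import annotations` import added at the top of the module is what allows that without quoting the class name. A minimal toy example of the mechanism (unrelated class names, not mteb code):

```python
# With postponed evaluation of annotations (PEP 563), a method may annotate its
# return type with the enclosing class directly instead of "Results" in quotes.
from __future__ import annotations


class Results:
    def __init__(self, items: list[str]) -> None:
        self.items = items

    def filtered(self, keep: set[str]) -> Results:  # no forward-reference quotes
        return Results([item for item in self.items if item in keep])


print(Results(["a", "b"]).filtered({"a"}).items)  # ['a']
```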
mteb/results/model_result.py
CHANGED

```diff
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Callable, Iterable
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast

 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import
+from typing_extensions import overload

 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -58,7 +60,7 @@ def _aggregate_and_pivot(
             index=index_columns,
             columns=columns,
             values="score",
-            aggfunc=aggregation_fn,
+            aggfunc=aggregation_fn,  # type: ignore[arg-type]
         ).reset_index()
     elif format == "long":
         return (
@@ -81,7 +83,7 @@ class ModelResult(BaseModel):
     model_revision: str | None
     task_results: list[TaskResult]
     default_modalities: list[Modalities] = Field(
-        default_factory=lambda: ["text"], alias="modalities"
+        default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
     )
     model_config = (
         ConfigDict(  # to free up the name model_* which is otherwise protected
@@ -95,16 +97,17 @@ class ModelResult(BaseModel):
         return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"

     @classmethod
-    def from_validated(cls, **data: dict[str, Any]) ->
+    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
         """Create a ModelResult from validated data.

         Args:
             data: The validated data.
         """
-        data["task_results"] = [
-            TaskResult.from_validated(**res)
+        data["task_results"] = [  # type: ignore[assignment]
+            TaskResult.from_validated(**res)  # type: ignore[arg-type]
+            for res in data["task_results"]
         ]
-        return cls.model_construct(**data)
+        return cls.model_construct(**data)  # type: ignore[arg-type]

     def _filter_tasks(
         self,
@@ -114,7 +117,7 @@ class ModelResult(BaseModel):
         task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) ->
+    ) -> ModelResult:
         new_task_results = []
         for task_result in self.task_results:
             if (task_names is not None) and (task_result.task_name not in task_names):
@@ -142,7 +145,7 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

-    def select_tasks(self, tasks:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
         """Select tasks from the ModelResult based on a list of AbsTask objects.

         Args:
@@ -160,6 +163,28 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["wide"] = "wide",
+    ) -> dict: ...
+
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["long"] = "long",
+    ) -> list: ...
+
     def _get_scores(
         self,
         splits: list[SplitName] | None = None,
@@ -177,21 +202,24 @@ class ModelResult(BaseModel):
             aggregation = aggregation if aggregation is not None else np.mean
         else:
             use_fast = True
+        aggregation = cast(Callable[[list[Score]], Any], aggregation)
+        getter = cast(Callable[[ScoresDict], Score], getter)
+
         if format == "wide":
             scores = {}
             for res in self.task_results:
                 try:
                     if use_fast:
                         scores[res.task_name] = res._get_score_fast(
-                            splits=splits,
-                            languages=languages,
+                            splits=splits,
+                            languages=languages,
                         )
                     else:
                         scores[res.task_name] = res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,
-                            getter=getter,
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                 except Exception as e:
@@ -206,14 +234,14 @@ class ModelResult(BaseModel):
                 if use_fast:
                     score = task_res._get_score_fast(
                         splits=splits,
-                        languages=languages,
+                        languages=languages,
                     )
                 else:
                     score = task_res.get_score(
                         splits=splits,
                         languages=languages,
-                        aggregation=aggregation,
-                        getter=getter,
+                        aggregation=aggregation,
+                        getter=getter,
                         scripts=scripts,
                     )
                 entry = dict(
@@ -317,7 +345,7 @@ class ModelResult(BaseModel):
     def __hash__(self) -> int:
         return id(self)

-    def __iter__(self) -> Iterable[TaskResult]:
+    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
         return iter(self.task_results)

     def __getitem__(self, index) -> TaskResult:
@@ -370,13 +398,13 @@ class ModelResult(BaseModel):
         return [task_res.task_name for task_res in self.task_results]

     @property
-    def modalities(self) -> list[
+    def modalities(self) -> list[Modalities]:
         """Get all modalities in the task results.

         Returns:
             A list of modalities in the task results.
         """
-        mods = []
+        mods: list[Modalities] = []
         for task_res in self.task_results:
             task_modalities = getattr(task_res, "modalities", [])
             mods.extend(task_modalities)
```
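The `@overload` declarations added for `_get_scores` encode that `format="wide"` yields a dict while `format="long"` yields a list, even though a single runtime implementation serves both. A simplified, self-contained sketch of this typing pattern (hypothetical `Scores` class and `get` method, not mteb's API):

```python
from typing import Any, Literal, overload


class Scores:
    @overload
    def get(self, format: Literal["wide"] = "wide") -> dict[str, float]: ...

    @overload
    def get(self, format: Literal["long"]) -> list[dict[str, Any]]: ...

    def get(self, format: Literal["wide", "long"] = "wide"):
        # single runtime implementation; the overloads above only inform the
        # type checker which return type corresponds to which literal argument
        data = {"TaskA": 0.5, "TaskB": 0.7}
        if format == "wide":
            return data
        return [{"task_name": name, "score": score} for name, score in data.items()]


wide = Scores().get()        # type checker sees dict[str, float]
long = Scores().get("long")  # type checker sees list[dict[str, Any]]
```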