mteb 2.5.1-py3-none-any.whl → 2.5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/abstask.py +6 -6
- mteb/abstasks/aggregated_task.py +4 -10
- mteb/abstasks/clustering_legacy.py +3 -2
- mteb/abstasks/task_metadata.py +2 -3
- mteb/cache.py +7 -4
- mteb/cli/build_cli.py +10 -5
- mteb/cli/generate_model_card.py +4 -3
- mteb/deprecated_evaluator.py +4 -3
- mteb/evaluate.py +4 -1
- mteb/get_tasks.py +4 -3
- mteb/leaderboard/app.py +70 -3
- mteb/models/abs_encoder.py +5 -3
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +4 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +13 -12
- mteb/models/model_implementations/align_models.py +1 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +17 -0
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +2 -0
- mteb/models/model_implementations/blip_models.py +8 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +2 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +3 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +4 -0
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +3 -0
- mteb/models/model_implementations/colqwen_models.py +9 -0
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +19 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +1 -0
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +8 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +6 -3
- mteb/models/model_implementations/google_models.py +5 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +2 -0
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +7 -1
- mteb/models/model_implementations/kalm_models.py +6 -0
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +1 -0
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +2 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +6 -0
- mteb/models/model_implementations/nomic_models_vision.py +1 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
- mteb/models/model_implementations/nvidia_models.py +3 -0
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +5 -0
- mteb/models/model_implementations/openclip_models.py +8 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +2 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +4 -0
- mteb/models/model_implementations/pylate_models.py +3 -0
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +3 -0
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +2 -1
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +1 -0
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +1 -0
- mteb/models/model_meta.py +49 -4
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +4 -1
- mteb/models/search_wrappers.py +4 -2
- mteb/models/sentence_transformer_wrapper.py +10 -10
- mteb/results/benchmark_results.py +67 -43
- mteb/results/model_result.py +3 -1
- mteb/results/task_result.py +22 -17
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/METADATA +1 -1
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/RECORD +148 -147
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/WHEEL +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED
@@ -26,7 +26,7 @@ from huggingface_hub.errors import (
     RepositoryNotFoundError,
     SafetensorsParsingError,
 )
-from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic import BaseModel, ConfigDict, field_validator, model_validator
 from transformers import AutoConfig
 from typing_extensions import Self

@@ -57,6 +57,8 @@ FRAMEWORKS = Literal[
     "ColPali",
 ]

+MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
+

 class ScoringFunction(HelpfulStrEnum):
     """The scoring function used by the models."""

@@ -114,7 +116,7 @@ class ModelMeta(BaseModel):
             a benchmark as well as mark dataset contaminations.
         adapted_from: Name of the model from which this model is adapted. For quantizations, fine-tunes, long doc extensions, etc.
         superseded_by: Name of the model that supersedes this model, e.g., nvidia/NV-Embed-v2 supersedes v1.
-
+        model_type: A list of strings representing the type of model.
         modalities: A list of strings representing the modalities the model supports. Default is ["text"].
         contacts: The people to contact in case of a problem in the model, preferably a GitHub handle.
     """

@@ -144,10 +146,49 @@ class ModelMeta(BaseModel):
     adapted_from: str | None = None
     superseded_by: str | None = None
     modalities: list[Modalities] = ["text"]
-
+    model_type: list[MODEL_TYPES] = ["dense"]
     citation: str | None = None
     contacts: list[str] | None = None

+    @model_validator(mode="before")
+    @classmethod
+    def handle_legacy_is_cross_encoder(cls, data: Any) -> Any:
+        """Handle legacy is_cross_encoder field by converting it to model_type.
+
+        This validator handles backward compatibility for the deprecated is_cross_encoder field.
+        If is_cross_encoder=True is provided, it adds "cross_encoder" to model_type.
+        """
+        if isinstance(data, dict) and "is_cross_encoder" in data:
+            is_cross_encoder_value = data.pop("is_cross_encoder")
+
+            if is_cross_encoder_value is not None:
+                warnings.warn(
+                    "is_cross_encoder is deprecated and will be removed in a future version. "
+                    "Use model_type=['cross-encoder'] instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
+            model_type = data.get("model_type", ["dense"])
+
+            if is_cross_encoder_value:
+                if "cross-encoder" not in model_type:
+                    data["model_type"] = ["cross-encoder"]
+            else:
+                if "cross-encoder" in model_type:
+                    model_type = [t for t in model_type if t != "cross-encoder"]
+                    data["model_type"] = model_type if model_type else ["dense"]
+
+        return data
+
+    @property
+    def is_cross_encoder(self) -> bool:
+        """Returns True if the model is a cross-encoder.
+
+        Derived from model_type field. A model is considered a cross-encoder if "cross-encoder" is in its model_type list.
+        """
+        return "cross-encoder" in self.model_type
+
     @field_validator("similarity_fn_name", mode="before")
     @classmethod
     def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:

@@ -183,6 +224,7 @@ class ModelMeta(BaseModel):
             else dict_repr["training_datasets"]
         )
         dict_repr["loader"] = _get_loader_name(loader)
+        dict_repr["is_cross_encoder"] = self.is_cross_encoder
         return dict_repr

     @field_validator("languages")

@@ -425,6 +467,7 @@ class ModelMeta(BaseModel):
         meta.loader = CrossEncoderWrapper
         meta.embed_dim = None
         meta.modalities = ["text"]
+        meta.model_type = ["cross-encoder"]
         return meta

     def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | None:

@@ -468,7 +511,9 @@ class ModelMeta(BaseModel):
             if adapted_training_datasets is not None:
                 training_datasets |= adapted_training_datasets
         except (ValueError, KeyError) as e:
-
+            msg = f"Could not get source model: {e} in MTEB"
+            logger.warning(msg)
+            warnings.warn(msg)

         return_dataset = training_datasets.copy()
         visited = set()
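The main change here replaces the boolean `is_cross_encoder` with a `model_type` list while keeping the old keyword usable through the `mode="before"` validator above. The sketch below replays that mapping as a plain dict transform so the behavior is easy to check in isolation; `map_legacy_fields` is a hypothetical name for illustration only, not part of mteb's API.

import warnings

def map_legacy_fields(data: dict) -> dict:
    # Hypothetical stand-in for ModelMeta.handle_legacy_is_cross_encoder.
    if "is_cross_encoder" in data:
        legacy = data.pop("is_cross_encoder")
        if legacy is not None:
            warnings.warn(
                "is_cross_encoder is deprecated; use model_type=['cross-encoder'].",
                DeprecationWarning,
                stacklevel=2,
            )
        model_type = data.get("model_type", ["dense"])
        if legacy:
            if "cross-encoder" not in model_type:
                data["model_type"] = ["cross-encoder"]
        elif "cross-encoder" in model_type:
            # An explicit is_cross_encoder=False strips the cross-encoder type.
            model_type = [t for t in model_type if t != "cross-encoder"]
            data["model_type"] = model_type or ["dense"]
    return data

print(map_legacy_fields({"is_cross_encoder": True}))
# {'model_type': ['cross-encoder']} (plus a DeprecationWarning)

Reading `is_cross_encoder` still works downstream: it is now a property derived from `model_type`, and `to_dict()` re-adds the key so serialized metadata keeps its old shape.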
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable

 import numpy as np

@@ -127,7 +128,9 @@ class FaissSearchIndex:
             query_id = query_idx_to_id[query_idx]
             ranked_ids = top_ranked.get(query_id)
             if not ranked_ids:
-
+                msg = f"No top-ranked documents for query {query_id}"
+                logger.warning(msg)
+                warnings.warn(msg)
                 scores_all.append([])
                 idxs_all.append([])
                 continue
mteb/models/search_wrappers.py
CHANGED
@@ -340,7 +340,8 @@ class SearchEncoderWrapper:
         for query_idx, query_embedding in enumerate(query_embeddings):
             query_id = query_idx_to_id[query_idx]
             if query_id not in top_ranked:
-
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue

             ranked_ids = top_ranked[query_id]

@@ -511,7 +512,8 @@ class SearchCrossEncoderWrapper:
         doc_pairs_ids: list[tuple[str, str]] = []
         for query_id, corpus_ids in top_ranked.items():
             if query_id not in top_ranked:
-
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue

             query_idx = query_id_to_idx[query_id]
mteb/models/sentence_transformer_wrapper.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import warnings
 from typing import TYPE_CHECKING, Any

 import numpy as np

@@ -75,9 +76,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         if built_in_prompts and not model_prompts:
             model_prompts = built_in_prompts
         elif model_prompts and built_in_prompts:
-
-
-            )
+            msg = f"Model prompts specified, these will overwrite the default model prompts. Current prompts will be:\n {model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)
             self.model.prompts = model_prompts

         self.model_prompts, invalid_prompts = self.validate_task_to_prompt_name(

@@ -86,9 +87,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):

         if invalid_prompts:
             invalid_prompts = "\n".join(invalid_prompts)
-
-
-            )
+            msg = f"Some prompts are not in the expected format and will be ignored. Problems:\n\n{invalid_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)

         if (
             self.model_prompts

@@ -98,10 +99,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
                 or PromptType.document.value not in self.model_prompts
             )
         ):
-
-
-
-            )
+            msg = f"SentenceTransformers that use prompts most often need to be configured with at least 'query' and 'document' prompts to ensure optimal performance. Received {self.model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)

         if hasattr(self.model, "similarity") and callable(self.model.similarity):
             self.similarity = self.model.similarity
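A pattern recurs throughout this release: branches that previously logged quietly (or did nothing visible) now build a `msg`, pass it to `logger.warning`, and, in most files, also raise it via `warnings.warn` (search_wrappers.py only logs before `continue`). A minimal sketch of the combined idea, using only the standard library; `_log_and_warn` is a hypothetical helper, as the actual code inlines these lines at each call site:

import logging
import warnings

logger = logging.getLogger(__name__)

def _log_and_warn(msg: str) -> None:
    # logger.warning reaches configured log handlers; warnings.warn routes the
    # same message through Python's warnings filters, so callers can surface,
    # silence, or escalate it (e.g. warnings.simplefilter("error") in tests).
    logger.warning(msg)
    warnings.warn(msg)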
mteb/results/benchmark_results.py
CHANGED

@@ -1,3 +1,4 @@
+import functools
 import json
 import logging
 import warnings

@@ -32,6 +33,24 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


+# Global cache for model metas and version parsing
+@functools.lru_cache
+def _get_cached_model_metas() -> dict[str, str | None]:
+    """Cache model metas to avoid repeated calls."""
+    return {meta.name: meta.revision for meta in get_model_metas()}
+
+
+@functools.lru_cache(maxsize=10000)
+def _parse_version_cached(version_str: str | None) -> Version | None:
+    """Cache version parsing to avoid repeated parsing."""
+    if version_str is None:
+        return None
+    try:
+        return Version(version_str)
+    except (InvalidVersion, TypeError):
+        return None
+
+
 class BenchmarkResults(BaseModel):
     """Data class to hold the benchmark results of a model.

@@ -174,40 +193,6 @@ class BenchmarkResults(BaseModel):
         Returns:
             A new BenchmarkResults object with the revisions joined.
         """
-
-        def parse_version(version_str: str) -> Version | None:
-            try:
-                return Version(version_str)
-            except (InvalidVersion, TypeError):
-                return None
-
-        def keep_best(group: pd.DataFrame) -> pd.DataFrame:
-            # Filtering out task_results where no scores are present
-            group = group[group["has_scores"]]
-            is_main_revision = group["revision"] == group["main_revision"]
-            # If the main revision is present we select that
-            if is_main_revision.sum() > 0:
-                return group[is_main_revision].head(n=1)
-            unique_revisions = group["revision"].unique()
-
-            # ensure None/NA/"external" revisions is filtered out
-            group.loc[group["revision"].isna(), "revision"] = "no_revision_available"
-            group.loc[group["revision"] == "external", "revision"] = (
-                "no_revision_available"
-            )
-
-            # Filtering out no_revision_available if other revisions are present
-            if (len(unique_revisions) > 1) and (
-                "no_revision_available" in unique_revisions
-            ):
-                group = group[group["revision"] != "no_revision_available"]
-            # If there are any not-NA mteb versions, we select the latest one
-            if group["mteb_version"].notna().any():
-                group = group.dropna(subset=["mteb_version"])
-                group = group.sort_values("mteb_version", ascending=False)
-                return group.head(n=1)
-            return group.head(n=1)
-
         records = []
         for model_result in self:
             for task_result in model_result.task_results:

@@ -224,17 +209,54 @@ class BenchmarkResults(BaseModel):
         if not records:
             return BenchmarkResults.model_construct(model_results=[])
         task_df = pd.DataFrame.from_records(records)
-
-
-
+
+        # Use cached model metas
+        model_to_main_revision = _get_cached_model_metas()
         task_df["main_revision"] = task_df["model"].map(model_to_main_revision)  # type: ignore
-
-
-
-
-
+
+        # Use cached version parsing
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)  # type: ignore
+
+        # Filter out rows without scores first
+        task_df = task_df[task_df["has_scores"]]
+
+        # Optimize groupby with vectorized operations
+        # Sort by priority: main_revision match, then mteb_version (descending), then revision
+        task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]
+
+        # Handle None/NA/external revisions
+        task_df["revision_clean"] = task_df["revision"].copy()
+        task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
+            "no_revision_available"
+        )
+        task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
+            "no_revision_available"
         )
+
+        # Create a priority column for sorting
+        # Higher priority = better to keep
+        # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
+        task_df["priority"] = 0
+        task_df.loc[task_df["is_main_revision"], "priority"] += 1000
+        task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
+        task_df.loc[
+            task_df["revision_clean"] != "no_revision_available", "priority"
+        ] += 10
+
+        # Sort by priority (desc), mteb_version (desc), and take first per group
+        task_df = task_df.sort_values(
+            ["model", "task_name", "priority", "mteb_version"],
+            ascending=[True, True, False, False],
+            na_position="last",
+        )
+
+        task_df = task_df.groupby(["model", "task_name"], as_index=False).first()
+
+        # Reconstruct model results
         model_results = []
+        # Group by original revision to maintain deterministic behavior
+        # After the first() selection above, each (model, task_name) is unique,
+        # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
                 model_name=model,

@@ -342,7 +364,9 @@ class BenchmarkResults(BaseModel):
             scores_data.extend(model_result._get_score_for_table())

         if not scores_data:
-
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()

         # Create DataFrame
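The `join_revisions` rewrite above drops the per-group `keep_best` function in favor of scoring every row once and resolving all (model, task) pairs with a single sort plus `groupby(...).first()`. Below is a self-contained pandas sketch of that pattern on invented toy rows; note the real code first parses `mteb_version` into `packaging.version.Version` objects through the lru-cached helper, whereas the toy data sorts version strings lexically.

import pandas as pd

df = pd.DataFrame(
    {
        "model": ["m1", "m1", "m1"],
        "task_name": ["t1", "t1", "t1"],
        "revision": ["abc", "no_revision_available", "abc"],
        "main_revision": ["abc", "abc", "abc"],
        "mteb_version": [None, "2.5.1", "2.5.3"],
    }
)

# Score each candidate row once instead of applying a Python
# function per (model, task) group.
df["priority"] = 0
df.loc[df["revision"] == df["main_revision"], "priority"] += 1000
df.loc[df["mteb_version"].notna(), "priority"] += 100
df.loc[df["revision"] != "no_revision_available", "priority"] += 10

# One global sort, then the first row of each group is the winner.
best = (
    df.sort_values(
        ["model", "task_name", "priority", "mteb_version"],
        ascending=[True, True, False, False],
        na_position="last",
    )
    .groupby(["model", "task_name"], as_index=False)
    .first()
)
print(best[["model", "task_name", "revision", "mteb_version"]])
# keeps the main-revision row with the newest mteb_version ("abc", "2.5.3")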
mteb/results/model_result.py
CHANGED
@@ -292,7 +292,9 @@ class ModelResult(BaseModel):
         scores_data = self._get_score_for_table()

         if not scores_data:
-
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()

         # Create DataFrame
mteb/results/task_result.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

 import json
 import logging
+import warnings
 from argparse import Namespace
 from collections import defaultdict
 from collections.abc import Callable, Iterable

@@ -462,7 +463,9 @@ class TaskResult(BaseModel):
         if main_score in hf_subset_scores:
             hf_subset_scores["main_score"] = hf_subset_scores[main_score]
         else:
-
+            msg = f"Main score {main_score} not found in scores"
+            logger.warning(msg)
+            warnings.warn(msg)
             hf_subset_scores["main_score"] = None

         # specific fixes:

@@ -633,21 +636,23 @@ class TaskResult(BaseModel):
         task = get_task(self.task_name)

         splits = task.eval_splits
-        hf_subsets = task.hf_subsets
-        hf_subsets = set(hf_subsets)
+        hf_subsets = set(task.hf_subsets)  # Convert to set once

         new_scores = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
                 continue
-            new_scores[split] = []
             seen_subsets = set()
-            for
-
-
-
+            # Use list comprehension for better performance
+            new_scores[split] = [
+                _scores
+                for _scores in self.scores[split]
+                if _scores["hf_subset"] in hf_subsets
+            ]
+            for _scores in new_scores[split]:
                 seen_subsets.add(_scores["hf_subset"])
+
             if seen_subsets != hf_subsets:
                 missing_subsets = hf_subsets - seen_subsets
                 if len(missing_subsets) > 2:

@@ -656,17 +661,17 @@ class TaskResult(BaseModel):
             else:
                 missing_subsets_str = str(missing_subsets)

-
-
-            )
+            msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
+            logger.warning(msg)
+            warnings.warn(msg)
             seen_splits.add(split)
         if seen_splits != set(splits):
-
-
-            )
-
-
-            return
+            msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
+            logger.warning(msg)
+            warnings.warn(msg)
+        data = self.model_dump()
+        data["scores"] = new_scores
+        return type(self).model_construct(**data)

     def is_mergeable(
         self,
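The last hunk changes `filter_and_validate` to return a rebuilt result instead of falling through: it dumps the model, swaps in the filtered `scores`, and reconstructs with `model_construct`, which bypasses re-validation. A small pydantic v2 sketch of that copy-with-one-field-patched pattern; the `Result` class here is a toy stand-in, not mteb's `TaskResult`:

from pydantic import BaseModel

class Result(BaseModel):  # toy stand-in for TaskResult
    task_name: str
    scores: dict[str, list[dict]]

original = Result(task_name="demo", scores={"test": [{"hf_subset": "en"}]})

data = original.model_dump()          # full field dict
data["scores"] = {"test": []}         # patch the filtered field
filtered = type(original).model_construct(**data)  # skips validators

print(filtered.scores)  # {'test': []}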
{mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.5.1
+Version: 2.5.3
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>