mteb 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. mteb/abstasks/abstask.py +6 -6
  2. mteb/abstasks/aggregated_task.py +4 -10
  3. mteb/abstasks/clustering_legacy.py +3 -2
  4. mteb/abstasks/task_metadata.py +2 -3
  5. mteb/cache.py +7 -4
  6. mteb/cli/build_cli.py +10 -5
  7. mteb/cli/generate_model_card.py +4 -3
  8. mteb/deprecated_evaluator.py +4 -3
  9. mteb/evaluate.py +4 -1
  10. mteb/get_tasks.py +4 -3
  11. mteb/leaderboard/app.py +70 -3
  12. mteb/models/abs_encoder.py +5 -3
  13. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +4 -1
  14. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +13 -12
  15. mteb/models/model_implementations/align_models.py +1 -0
  16. mteb/models/model_implementations/amazon_models.py +1 -0
  17. mteb/models/model_implementations/andersborges.py +2 -0
  18. mteb/models/model_implementations/ara_models.py +1 -0
  19. mteb/models/model_implementations/arctic_models.py +8 -0
  20. mteb/models/model_implementations/b1ade_models.py +1 -0
  21. mteb/models/model_implementations/bedrock_models.py +4 -0
  22. mteb/models/model_implementations/bge_models.py +17 -0
  23. mteb/models/model_implementations/bica_model.py +1 -0
  24. mteb/models/model_implementations/blip2_models.py +2 -0
  25. mteb/models/model_implementations/blip_models.py +8 -0
  26. mteb/models/model_implementations/bm25.py +1 -0
  27. mteb/models/model_implementations/bmretriever_models.py +4 -0
  28. mteb/models/model_implementations/cadet_models.py +1 -0
  29. mteb/models/model_implementations/cde_models.py +2 -0
  30. mteb/models/model_implementations/clip_models.py +3 -0
  31. mteb/models/model_implementations/clips_models.py +3 -0
  32. mteb/models/model_implementations/codefuse_models.py +3 -0
  33. mteb/models/model_implementations/codesage_models.py +3 -0
  34. mteb/models/model_implementations/cohere_models.py +4 -0
  35. mteb/models/model_implementations/cohere_v.py +5 -0
  36. mteb/models/model_implementations/colpali_models.py +3 -0
  37. mteb/models/model_implementations/colqwen_models.py +9 -0
  38. mteb/models/model_implementations/colsmol_models.py +2 -0
  39. mteb/models/model_implementations/conan_models.py +1 -0
  40. mteb/models/model_implementations/dino_models.py +19 -0
  41. mteb/models/model_implementations/e5_instruct.py +4 -0
  42. mteb/models/model_implementations/e5_models.py +9 -0
  43. mteb/models/model_implementations/e5_v.py +1 -0
  44. mteb/models/model_implementations/eagerworks_models.py +1 -0
  45. mteb/models/model_implementations/emillykkejensen_models.py +3 -0
  46. mteb/models/model_implementations/en_code_retriever.py +1 -0
  47. mteb/models/model_implementations/euler_models.py +1 -0
  48. mteb/models/model_implementations/evaclip_models.py +4 -0
  49. mteb/models/model_implementations/fa_models.py +8 -0
  50. mteb/models/model_implementations/facebookai.py +2 -0
  51. mteb/models/model_implementations/geogpt_models.py +1 -0
  52. mteb/models/model_implementations/gme_v_models.py +6 -3
  53. mteb/models/model_implementations/google_models.py +5 -0
  54. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
  55. mteb/models/model_implementations/gritlm_models.py +2 -0
  56. mteb/models/model_implementations/gte_models.py +9 -0
  57. mteb/models/model_implementations/hinvec_models.py +1 -0
  58. mteb/models/model_implementations/human.py +1 -0
  59. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  60. mteb/models/model_implementations/inf_models.py +2 -0
  61. mteb/models/model_implementations/jasper_models.py +2 -0
  62. mteb/models/model_implementations/jina_clip.py +1 -0
  63. mteb/models/model_implementations/jina_models.py +7 -1
  64. mteb/models/model_implementations/kalm_models.py +6 -0
  65. mteb/models/model_implementations/kblab.py +1 -0
  66. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
  67. mteb/models/model_implementations/kfst.py +1 -0
  68. mteb/models/model_implementations/kowshik24_models.py +1 -0
  69. mteb/models/model_implementations/lens_models.py +2 -0
  70. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  71. mteb/models/model_implementations/linq_models.py +1 -0
  72. mteb/models/model_implementations/listconranker.py +1 -1
  73. mteb/models/model_implementations/llm2clip_models.py +3 -0
  74. mteb/models/model_implementations/llm2vec_models.py +8 -0
  75. mteb/models/model_implementations/mcinext_models.py +7 -1
  76. mteb/models/model_implementations/mdbr_models.py +2 -0
  77. mteb/models/model_implementations/misc_models.py +63 -0
  78. mteb/models/model_implementations/mme5_models.py +1 -0
  79. mteb/models/model_implementations/moco_models.py +2 -0
  80. mteb/models/model_implementations/model2vec_models.py +13 -0
  81. mteb/models/model_implementations/moka_models.py +3 -0
  82. mteb/models/model_implementations/mxbai_models.py +3 -0
  83. mteb/models/model_implementations/nbailab.py +3 -0
  84. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  85. mteb/models/model_implementations/nomic_models.py +6 -0
  86. mteb/models/model_implementations/nomic_models_vision.py +1 -0
  87. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
  88. mteb/models/model_implementations/nvidia_models.py +3 -0
  89. mteb/models/model_implementations/octen_models.py +195 -0
  90. mteb/models/model_implementations/openai_models.py +5 -0
  91. mteb/models/model_implementations/openclip_models.py +8 -0
  92. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  93. mteb/models/model_implementations/ops_moa_models.py +2 -0
  94. mteb/models/model_implementations/pawan_models.py +1 -0
  95. mteb/models/model_implementations/piccolo_models.py +2 -0
  96. mteb/models/model_implementations/promptriever_models.py +4 -0
  97. mteb/models/model_implementations/pylate_models.py +3 -0
  98. mteb/models/model_implementations/qodo_models.py +2 -0
  99. mteb/models/model_implementations/qtack_models.py +1 -0
  100. mteb/models/model_implementations/qwen3_models.py +3 -0
  101. mteb/models/model_implementations/qzhou_models.py +2 -0
  102. mteb/models/model_implementations/random_baseline.py +2 -1
  103. mteb/models/model_implementations/rasgaard_models.py +1 -0
  104. mteb/models/model_implementations/reasonir_model.py +1 -0
  105. mteb/models/model_implementations/repllama_models.py +2 -0
  106. mteb/models/model_implementations/rerankers_custom.py +3 -3
  107. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  108. mteb/models/model_implementations/richinfoai_models.py +1 -0
  109. mteb/models/model_implementations/ru_sentence_models.py +20 -0
  110. mteb/models/model_implementations/ruri_models.py +10 -0
  111. mteb/models/model_implementations/salesforce_models.py +3 -0
  112. mteb/models/model_implementations/samilpwc_models.py +1 -0
  113. mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
  114. mteb/models/model_implementations/searchmap_models.py +1 -0
  115. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
  116. mteb/models/model_implementations/seed_models.py +1 -0
  117. mteb/models/model_implementations/sentence_transformers_models.py +18 -0
  118. mteb/models/model_implementations/shuu_model.py +32 -31
  119. mteb/models/model_implementations/siglip_models.py +10 -0
  120. mteb/models/model_implementations/sonar_models.py +1 -0
  121. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
  122. mteb/models/model_implementations/stella_models.py +6 -0
  123. mteb/models/model_implementations/tarka_models.py +2 -0
  124. mteb/models/model_implementations/ua_sentence_models.py +1 -0
  125. mteb/models/model_implementations/uae_models.py +1 -0
  126. mteb/models/model_implementations/vdr_models.py +1 -0
  127. mteb/models/model_implementations/vi_vn_models.py +6 -0
  128. mteb/models/model_implementations/vista_models.py +2 -0
  129. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  130. mteb/models/model_implementations/voyage_models.py +15 -0
  131. mteb/models/model_implementations/voyage_v.py +1 -0
  132. mteb/models/model_implementations/xyz_models.py +1 -0
  133. mteb/models/model_implementations/youtu_models.py +1 -0
  134. mteb/models/model_implementations/yuan_models.py +1 -0
  135. mteb/models/model_implementations/yuan_models_en.py +1 -0
  136. mteb/models/model_meta.py +49 -4
  137. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +4 -1
  138. mteb/models/search_wrappers.py +4 -2
  139. mteb/models/sentence_transformer_wrapper.py +10 -10
  140. mteb/results/benchmark_results.py +67 -43
  141. mteb/results/model_result.py +3 -1
  142. mteb/results/task_result.py +22 -17
  143. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/METADATA +1 -1
  144. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/RECORD +148 -147
  145. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/WHEEL +0 -0
  146. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/entry_points.txt +0 -0
  147. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/licenses/LICENSE +0 -0
  148. {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py CHANGED
@@ -26,7 +26,7 @@ from huggingface_hub.errors import (
     RepositoryNotFoundError,
     SafetensorsParsingError,
 )
-from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic import BaseModel, ConfigDict, field_validator, model_validator
 from transformers import AutoConfig
 from typing_extensions import Self
 
@@ -57,6 +57,8 @@ FRAMEWORKS = Literal[
     "ColPali",
 ]
 
+MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
+
 
 class ScoringFunction(HelpfulStrEnum):
     """The scoring function used by the models."""
@@ -114,7 +116,7 @@ class ModelMeta(BaseModel):
         a benchmark as well as mark dataset contaminations.
         adapted_from: Name of the model from which this model is adapted. For quantizations, fine-tunes, long doc extensions, etc.
         superseded_by: Name of the model that supersedes this model, e.g., nvidia/NV-Embed-v2 supersedes v1.
-        is_cross_encoder: Whether the model can act as a cross-encoder or not.
+        model_type: A list of strings representing the type of model.
         modalities: A list of strings representing the modalities the model supports. Default is ["text"].
         contacts: The people to contact in case of a problem in the model, preferably a GitHub handle.
     """
@@ -144,10 +146,49 @@
     adapted_from: str | None = None
     superseded_by: str | None = None
     modalities: list[Modalities] = ["text"]
-    is_cross_encoder: bool | None = None
+    model_type: list[MODEL_TYPES] = ["dense"]
     citation: str | None = None
     contacts: list[str] | None = None
 
+    @model_validator(mode="before")
+    @classmethod
+    def handle_legacy_is_cross_encoder(cls, data: Any) -> Any:
+        """Handle legacy is_cross_encoder field by converting it to model_type.
+
+        This validator handles backward compatibility for the deprecated is_cross_encoder field.
+        If is_cross_encoder=True is provided, it adds "cross_encoder" to model_type.
+        """
+        if isinstance(data, dict) and "is_cross_encoder" in data:
+            is_cross_encoder_value = data.pop("is_cross_encoder")
+
+            if is_cross_encoder_value is not None:
+                warnings.warn(
+                    "is_cross_encoder is deprecated and will be removed in a future version. "
+                    "Use model_type=['cross-encoder'] instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
+            model_type = data.get("model_type", ["dense"])
+
+            if is_cross_encoder_value:
+                if "cross-encoder" not in model_type:
+                    data["model_type"] = ["cross-encoder"]
+            else:
+                if "cross-encoder" in model_type:
+                    model_type = [t for t in model_type if t != "cross-encoder"]
+                    data["model_type"] = model_type if model_type else ["dense"]
+
+        return data
+
+    @property
+    def is_cross_encoder(self) -> bool:
+        """Returns True if the model is a cross-encoder.
+
+        Derived from model_type field. A model is considered a cross-encoder if "cross-encoder" is in its model_type list.
+        """
+        return "cross-encoder" in self.model_type
+
     @field_validator("similarity_fn_name", mode="before")
     @classmethod
     def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
@@ -183,6 +224,7 @@
             else dict_repr["training_datasets"]
         )
         dict_repr["loader"] = _get_loader_name(loader)
+        dict_repr["is_cross_encoder"] = self.is_cross_encoder
         return dict_repr
 
     @field_validator("languages")
@@ -425,6 +467,7 @@
         meta.loader = CrossEncoderWrapper
         meta.embed_dim = None
         meta.modalities = ["text"]
+        meta.model_type = ["cross-encoder"]
         return meta
 
     def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | None:
@@ -468,7 +511,9 @@
             if adapted_training_datasets is not None:
                 training_datasets |= adapted_training_datasets
         except (ValueError, KeyError) as e:
-            logger.warning(f"Could not get source model: {e} in MTEB")
+            msg = f"Could not get source model: {e} in MTEB"
+            logger.warning(msg)
+            warnings.warn(msg)
 
         return_dataset = training_datasets.copy()
         visited = set()
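
The change above replaces the boolean is_cross_encoder field with a model_type list while keeping old call sites working via a before-mode validator, and re-exposes is_cross_encoder as a derived property. A minimal, self-contained sketch of that backward-compatibility pattern (LegacyAwareMeta and its fields are illustrative stand-ins, not the real ModelMeta; the False branch of the real validator is omitted):

    import warnings
    from typing import Any, Literal

    from pydantic import BaseModel, model_validator

    ModelType = Literal["dense", "cross-encoder", "late-interaction"]


    class LegacyAwareMeta(BaseModel):
        name: str
        model_type: list[ModelType] = ["dense"]

        @model_validator(mode="before")
        @classmethod
        def _migrate_is_cross_encoder(cls, data: Any) -> Any:
            # Simplified version of handle_legacy_is_cross_encoder above:
            # rewrite the deprecated boolean into model_type before field validation.
            if isinstance(data, dict) and data.pop("is_cross_encoder", None):
                warnings.warn("is_cross_encoder is deprecated", DeprecationWarning)
                data["model_type"] = ["cross-encoder"]
            return data


    meta = LegacyAwareMeta(name="demo", is_cross_encoder=True)  # emits DeprecationWarning
    assert meta.model_type == ["cross-encoder"]

Because the validator runs in mode="before", the deprecated key is rewritten before pydantic checks the field set, so old keyword arguments keep working.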
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from collections.abc import Callable
 
 import numpy as np
@@ -127,7 +128,9 @@ class FaissSearchIndex:
             query_id = query_idx_to_id[query_idx]
             ranked_ids = top_ranked.get(query_id)
             if not ranked_ids:
-                logger.warning(f"No top-ranked documents for query {query_id}")
+                msg = f"No top-ranked documents for query {query_id}"
+                logger.warning(msg)
+                warnings.warn(msg)
                 scores_all.append([])
                 idxs_all.append([])
                 continue
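
Many hunks in this release pair logger.warning(msg) with a warnings.warn(msg) of the same message. A small stdlib-only sketch (the lookup helper is hypothetical) of what the extra warnings.warn buys: callers can capture and assert on the message programmatically, independent of logging configuration:

    import warnings


    def lookup(top_ranked: dict, query_id: str) -> list:
        # Same pattern as the diff: surface the problem via the warnings machinery.
        if not top_ranked.get(query_id):
            warnings.warn(f"No top-ranked documents for query {query_id}")
            return []
        return top_ranked[query_id]


    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        lookup({}, "q1")

    assert "q1" in str(caught[0].message)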
mteb/models/search_wrappers.py CHANGED
@@ -340,7 +340,8 @@ class SearchEncoderWrapper:
         for query_idx, query_embedding in enumerate(query_embeddings):
             query_id = query_idx_to_id[query_idx]
             if query_id not in top_ranked:
-                logger.warning(f"No pre-ranked documents found for query {query_id}")
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue
 
             ranked_ids = top_ranked[query_id]
@@ -511,7 +512,8 @@ class SearchCrossEncoderWrapper:
         doc_pairs_ids: list[tuple[str, str]] = []
         for query_id, corpus_ids in top_ranked.items():
             if query_id not in top_ranked:
-                logger.warning(f"No pre-ranked documents found for query {query_id}")
+                msg = f"No pre-ranked documents found for query {query_id}"
+                logger.warning(msg)
                 continue
 
             query_idx = query_id_to_idx[query_id]
mteb/models/sentence_transformer_wrapper.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+import warnings
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
@@ -75,9 +76,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         if built_in_prompts and not model_prompts:
             model_prompts = built_in_prompts
         elif model_prompts and built_in_prompts:
-            logger.warning(
-                f"Model prompts specified, these will overwrite the default model prompts. Current prompts will be:\n {model_prompts}"
-            )
+            msg = f"Model prompts specified, these will overwrite the default model prompts. Current prompts will be:\n {model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)
             self.model.prompts = model_prompts
 
         self.model_prompts, invalid_prompts = self.validate_task_to_prompt_name(
@@ -86,9 +87,9 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
 
         if invalid_prompts:
             invalid_prompts = "\n".join(invalid_prompts)
-            logger.warning(
-                f"Some prompts are not in the expected format and will be ignored. Problems:\n\n{invalid_prompts}"
-            )
+            msg = f"Some prompts are not in the expected format and will be ignored. Problems:\n\n{invalid_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)
 
         if (
             self.model_prompts
@@ -98,10 +99,9 @@
                 or PromptType.document.value not in self.model_prompts
             )
         ):
-            logger.warning(
-                "SentenceTransformers that use prompts most often need to be configured with at least 'query' and"
-                f" 'document' prompts to ensure optimal performance. Received {self.model_prompts}"
-            )
+            msg = f"SentenceTransformers that use prompts most often need to be configured with at least 'query' and 'document' prompts to ensure optimal performance. Received {self.model_prompts}"
+            logger.warning(msg)
+            warnings.warn(msg)
 
         if hasattr(self.model, "similarity") and callable(self.model.similarity):
             self.similarity = self.model.similarity
mteb/results/benchmark_results.py CHANGED
@@ -1,3 +1,4 @@
+import functools
 import json
 import logging
 import warnings
@@ -32,6 +33,24 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)
 
 
+# Global cache for model metas and version parsing
+@functools.lru_cache
+def _get_cached_model_metas() -> dict[str, str | None]:
+    """Cache model metas to avoid repeated calls."""
+    return {meta.name: meta.revision for meta in get_model_metas()}
+
+
+@functools.lru_cache(maxsize=10000)
+def _parse_version_cached(version_str: str | None) -> Version | None:
+    """Cache version parsing to avoid repeated parsing."""
+    if version_str is None:
+        return None
+    try:
+        return Version(version_str)
+    except (InvalidVersion, TypeError):
+        return None
+
+
 class BenchmarkResults(BaseModel):
     """Data class to hold the benchmark results of a model.
 
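
The two memoized helpers above trade a little memory for skipping repeated get_model_metas() calls and version parsing. A standalone sketch of the caching behavior, using the same packaging.version.Version the module works with (the helper name here is illustrative):

    from __future__ import annotations

    import functools

    from packaging.version import InvalidVersion, Version


    @functools.lru_cache(maxsize=10000)
    def parse_version(version_str: str | None) -> Version | None:
        # Mirrors _parse_version_cached above.
        if version_str is None:
            return None
        try:
            return Version(version_str)
        except (InvalidVersion, TypeError):
            return None


    parse_version("2.5.3")
    parse_version("2.5.3")  # second call is served from the cache
    print(parse_version.cache_info())  # CacheInfo(hits=1, misses=1, ...)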
@@ -174,40 +193,6 @@
         Returns:
             A new BenchmarkResults object with the revisions joined.
         """
-
-        def parse_version(version_str: str) -> Version | None:
-            try:
-                return Version(version_str)
-            except (InvalidVersion, TypeError):
-                return None
-
-        def keep_best(group: pd.DataFrame) -> pd.DataFrame:
-            # Filtering out task_results where no scores are present
-            group = group[group["has_scores"]]
-            is_main_revision = group["revision"] == group["main_revision"]
-            # If the main revision is present we select that
-            if is_main_revision.sum() > 0:
-                return group[is_main_revision].head(n=1)
-            unique_revisions = group["revision"].unique()
-
-            # ensure None/NA/"external" revisions is filtered out
-            group.loc[group["revision"].isna(), "revision"] = "no_revision_available"
-            group.loc[group["revision"] == "external", "revision"] = (
-                "no_revision_available"
-            )
-
-            # Filtering out no_revision_available if other revisions are present
-            if (len(unique_revisions) > 1) and (
-                "no_revision_available" in unique_revisions
-            ):
-                group = group[group["revision"] != "no_revision_available"]
-            # If there are any not-NA mteb versions, we select the latest one
-            if group["mteb_version"].notna().any():
-                group = group.dropna(subset=["mteb_version"])
-                group = group.sort_values("mteb_version", ascending=False)
-                return group.head(n=1)
-            return group.head(n=1)
-
         records = []
         for model_result in self:
             for task_result in model_result.task_results:
@@ -224,17 +209,54 @@
         if not records:
             return BenchmarkResults.model_construct(model_results=[])
         task_df = pd.DataFrame.from_records(records)
-        model_to_main_revision = {
-            meta.name: meta.revision for meta in get_model_metas()
-        }
+
+        # Use cached model metas
+        model_to_main_revision = _get_cached_model_metas()
         task_df["main_revision"] = task_df["model"].map(model_to_main_revision)  # type: ignore
-        task_df["mteb_version"] = task_df["mteb_version"].map(parse_version)  # type: ignore
-        task_df = (
-            task_df.groupby(["model", "task_name"])
-            .apply(keep_best)
-            .reset_index(drop=True)
+
+        # Use cached version parsing
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)  # type: ignore
+
+        # Filter out rows without scores first
+        task_df = task_df[task_df["has_scores"]]
+
+        # Optimize groupby with vectorized operations
+        # Sort by priority: main_revision match, then mteb_version (descending), then revision
+        task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]
+
+        # Handle None/NA/external revisions
+        task_df["revision_clean"] = task_df["revision"].copy()
+        task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
+            "no_revision_available"
+        )
+        task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
+            "no_revision_available"
         )
+
+        # Create a priority column for sorting
+        # Higher priority = better to keep
+        # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
+        task_df["priority"] = 0
+        task_df.loc[task_df["is_main_revision"], "priority"] += 1000
+        task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
+        task_df.loc[
+            task_df["revision_clean"] != "no_revision_available", "priority"
+        ] += 10
+
+        # Sort by priority (desc), mteb_version (desc), and take first per group
+        task_df = task_df.sort_values(
+            ["model", "task_name", "priority", "mteb_version"],
+            ascending=[True, True, False, False],
+            na_position="last",
+        )
+
+        task_df = task_df.groupby(["model", "task_name"], as_index=False).first()
+
+        # Reconstruct model results
         model_results = []
+        # Group by original revision to maintain deterministic behavior
+        # After the first() selection above, each (model, task_name) is unique,
+        # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
                 model_name=model,
@@ -342,7 +364,9 @@
             scores_data.extend(model_result._get_score_for_table())
 
         if not scores_data:
-            logger.warning("No scores data available. Returning empty DataFrame.")
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()
 
         # Create DataFrame
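
The join-revisions rewrite above replaces a per-group Python callback (groupby(...).apply(keep_best)) with one vectorized sort followed by groupby(...).first(). A toy frame (hypothetical data) showing the selection mechanics:

    import pandas as pd

    # Two results for the same (model, task): one on the main revision, one not.
    df = pd.DataFrame(
        {
            "model": ["m1", "m1"],
            "task_name": ["t1", "t1"],
            "priority": [1110, 100],  # main revision + version + revision vs. version only
            "score": [0.71, 0.68],
        }
    )

    best = (
        df.sort_values(["model", "task_name", "priority"], ascending=[True, True, False])
        .groupby(["model", "task_name"], as_index=False)
        .first()  # first non-null value per column within each sorted group
    )
    print(best[["model", "task_name", "score"]])  # keeps the priority-1110 row (0.71)

Sorting once and taking the first row per group avoids calling a Python function for every (model, task_name) group, which is where the old apply-based version spent its time.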
mteb/results/model_result.py CHANGED
@@ -292,7 +292,9 @@ class ModelResult(BaseModel):
         scores_data = self._get_score_for_table()
 
         if not scores_data:
-            logger.warning("No scores data available. Returning empty DataFrame.")
+            msg = "No scores data available. Returning empty DataFrame."
+            logger.warning(msg)
+            warnings.warn(msg)
             return pd.DataFrame()
 
         # Create DataFrame
mteb/results/task_result.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import json
 import logging
+import warnings
 from argparse import Namespace
 from collections import defaultdict
 from collections.abc import Callable, Iterable
@@ -462,7 +463,9 @@
             if main_score in hf_subset_scores:
                 hf_subset_scores["main_score"] = hf_subset_scores[main_score]
             else:
-                logger.warning(f"Main score {main_score} not found in scores")
+                msg = f"Main score {main_score} not found in scores"
+                logger.warning(msg)
+                warnings.warn(msg)
                 hf_subset_scores["main_score"] = None
 
         # specific fixes:
@@ -633,21 +636,23 @@
         task = get_task(self.task_name)
 
         splits = task.eval_splits
-        hf_subsets = task.hf_subsets
-        hf_subsets = set(hf_subsets)
+        hf_subsets = set(task.hf_subsets)  # Convert to set once
 
         new_scores = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
                 continue
-            new_scores[split] = []
             seen_subsets = set()
-            for _scores in self.scores[split]:
-                if _scores["hf_subset"] not in hf_subsets:
-                    continue
-                new_scores[split].append(_scores)
+            # Use list comprehension for better performance
+            new_scores[split] = [
+                _scores
+                for _scores in self.scores[split]
+                if _scores["hf_subset"] in hf_subsets
+            ]
+            for _scores in new_scores[split]:
                 seen_subsets.add(_scores["hf_subset"])
+
             if seen_subsets != hf_subsets:
                 missing_subsets = hf_subsets - seen_subsets
                 if len(missing_subsets) > 2:
@@ -656,17 +661,17 @@
                 else:
                     missing_subsets_str = str(missing_subsets)
 
-                logger.warning(
-                    f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
-                )
+                msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
+                logger.warning(msg)
+                warnings.warn(msg)
             seen_splits.add(split)
         if seen_splits != set(splits):
-            logger.warning(
-                f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
-            )
-        new_res = {**self.to_dict(), "scores": new_scores}
-        new_res = TaskResult.from_validated(**new_res)
-        return new_res
+            msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
+            logger.warning(msg)
+            warnings.warn(msg)
+        data = self.model_dump()
+        data["scores"] = new_scores
+        return type(self).model_construct(**data)
 
     def is_mergeable(
         self,
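
The tail of the last hunk swaps a re-validating round trip (to_dict() plus TaskResult.from_validated) for pydantic v2's model_dump()/model_construct(), which rebuilds the object without re-running validators. A minimal sketch of that behavior (the Point model is illustrative):

    from pydantic import BaseModel


    class Point(BaseModel):
        x: int
        y: int


    p = Point(x=1, y=2)
    data = p.model_dump()                  # plain dict of field values
    data["y"] = 5
    q = type(p).model_construct(**data)    # rebuilds without re-running validation
    assert (q.x, q.y) == (1, 5)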
{mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.5.1
+Version: 2.5.3
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>