mteb 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/abstask.py +6 -6
- mteb/abstasks/aggregated_task.py +4 -10
- mteb/abstasks/clustering_legacy.py +3 -2
- mteb/abstasks/task_metadata.py +2 -3
- mteb/cache.py +7 -4
- mteb/cli/build_cli.py +10 -5
- mteb/cli/generate_model_card.py +4 -3
- mteb/deprecated_evaluator.py +4 -3
- mteb/evaluate.py +4 -1
- mteb/get_tasks.py +4 -3
- mteb/leaderboard/app.py +70 -3
- mteb/models/abs_encoder.py +5 -3
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +4 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +13 -12
- mteb/models/model_implementations/align_models.py +1 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +17 -0
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +2 -0
- mteb/models/model_implementations/blip_models.py +8 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +2 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +3 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +4 -0
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +3 -0
- mteb/models/model_implementations/colqwen_models.py +9 -0
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +19 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +1 -0
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +8 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +6 -3
- mteb/models/model_implementations/google_models.py +5 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -0
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +2 -0
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +7 -1
- mteb/models/model_implementations/kalm_models.py +6 -0
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +1 -0
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +2 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +6 -0
- mteb/models/model_implementations/nomic_models_vision.py +1 -0
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -0
- mteb/models/model_implementations/nvidia_models.py +3 -0
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +5 -0
- mteb/models/model_implementations/openclip_models.py +8 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +2 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +4 -0
- mteb/models/model_implementations/pylate_models.py +3 -0
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +3 -0
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +2 -1
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +1 -0
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +1 -0
- mteb/models/model_meta.py +49 -4
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +4 -1
- mteb/models/search_wrappers.py +4 -2
- mteb/models/sentence_transformer_wrapper.py +10 -10
- mteb/results/benchmark_results.py +67 -43
- mteb/results/model_result.py +3 -1
- mteb/results/task_result.py +22 -17
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/METADATA +1 -1
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/RECORD +148 -147
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/WHEEL +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.1.dist-info → mteb-2.5.3.dist-info}/top_level.txt +0 -0
mteb/abstasks/abstask.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
5
|
from collections.abc import Sequence
|
|
5
6
|
from copy import copy
|
|
@@ -102,9 +103,9 @@ class AbsTask(ABC):
|
|
|
102
103
|
def check_if_dataset_is_superseded(self) -> None:
|
|
103
104
|
"""Check if the dataset is superseded by a newer version."""
|
|
104
105
|
if self.superseded_by:
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
)
|
|
106
|
+
msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
|
|
107
|
+
logger.warning(msg)
|
|
108
|
+
warnings.warn(msg)
|
|
108
109
|
|
|
109
110
|
def dataset_transform(self):
|
|
110
111
|
"""A transform operations applied to the dataset after loading.
|
|
@@ -607,9 +608,8 @@ class AbsTask(ABC):
|
|
|
607
608
|
self.data_loaded = False
|
|
608
609
|
logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
|
|
609
610
|
else:
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
)
|
|
611
|
+
msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
|
|
612
|
+
logger.warning(msg)
|
|
613
613
|
|
|
614
614
|
@property
|
|
615
615
|
def superseded_by(self) -> str | None:
|
mteb/abstasks/aggregated_task.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from typing import Any
|
|
4
5
|
|
|
@@ -113,20 +114,13 @@ class AbsTaskAggregate(AbsTask):
|
|
|
113
114
|
)
|
|
114
115
|
mteb_versions = {tr.mteb_version for tr in task_results}
|
|
115
116
|
if len(mteb_versions) != 1:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
)
|
|
117
|
+
msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
|
|
118
|
+
logger.warning(msg)
|
|
119
|
+
warnings.warn(msg)
|
|
119
120
|
task_res.mteb_version = None
|
|
120
121
|
task_res.mteb_version = task_results[0].mteb_version
|
|
121
122
|
return task_res
|
|
122
123
|
|
|
123
|
-
def check_if_dataset_is_superseded(self) -> None:
|
|
124
|
-
"""Check if the dataset is superseded by a newer version"""
|
|
125
|
-
if self.superseded_by:
|
|
126
|
-
logger.warning(
|
|
127
|
-
f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
|
|
128
|
-
)
|
|
129
|
-
|
|
130
124
|
def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
|
|
131
125
|
"""Filter the evaluation splits of the task.
|
|
132
126
|
|
|
@@ -89,6 +89,9 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
89
89
|
prediction_folder: Path | None = None,
|
|
90
90
|
**kwargs: Any,
|
|
91
91
|
) -> ScoresDict:
|
|
92
|
+
data_split = data_split.select_columns(
|
|
93
|
+
[self.input_column_name, self.label_column_name]
|
|
94
|
+
)
|
|
92
95
|
# MTEB text clustering requires renaming and eval per subset.
|
|
93
96
|
if self.metadata.modalities == ["text"]:
|
|
94
97
|
all_metrics = []
|
|
@@ -97,8 +100,6 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
97
100
|
logger.info(
|
|
98
101
|
f"Running clustering on cluster ({i + 1}/{len(data_split)})"
|
|
99
102
|
)
|
|
100
|
-
if "__index_level_0__" in cluster_set:
|
|
101
|
-
cluster_set.pop("__index_level_0__")
|
|
102
103
|
clustering_dataset = Dataset.from_dict(cluster_set).select_columns(
|
|
103
104
|
[self.input_column_name, self.label_column_name]
|
|
104
105
|
)
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -376,9 +376,8 @@ class TaskMetadata(BaseModel):
|
|
|
376
376
|
if include_cite and cite:
|
|
377
377
|
# check for whitespace in the citation
|
|
378
378
|
if " " in cite:
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
)
|
|
379
|
+
msg = "Citation contains whitespace. Please ensure that the citation is correctly formatted."
|
|
380
|
+
logger.warning(msg)
|
|
382
381
|
return f"\\cite{{{cite}}}"
|
|
383
382
|
return cite
|
|
384
383
|
|
mteb/cache.py
CHANGED
|
@@ -3,6 +3,7 @@ import logging
|
|
|
3
3
|
import os
|
|
4
4
|
import shutil
|
|
5
5
|
import subprocess
|
|
6
|
+
import warnings
|
|
6
7
|
from collections import defaultdict
|
|
7
8
|
from collections.abc import Sequence
|
|
8
9
|
from pathlib import Path
|
|
@@ -83,9 +84,9 @@ class ResultCache:
|
|
|
83
84
|
model_path = results_folder / model_name
|
|
84
85
|
|
|
85
86
|
if model_revision is None:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
)
|
|
87
|
+
msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
|
|
88
|
+
logger.warning(msg)
|
|
89
|
+
warnings.warn(msg)
|
|
89
90
|
# get revs from paths
|
|
90
91
|
revisions = [p for p in model_path.glob("*") if p.is_dir()]
|
|
91
92
|
if not revisions:
|
|
@@ -281,7 +282,9 @@ class ResultCache:
|
|
|
281
282
|
shutil.rmtree(self.cache_path)
|
|
282
283
|
logger.info(f"Cache directory {self.cache_path} cleared.")
|
|
283
284
|
else:
|
|
284
|
-
|
|
285
|
+
msg = f"Cache directory `{self.cache_path}` does not exist."
|
|
286
|
+
logger.warning(msg)
|
|
287
|
+
warnings.warn(msg)
|
|
285
288
|
|
|
286
289
|
def __repr__(self) -> str:
|
|
287
290
|
return f"ResultCache(cache_path={self.cache_path})"
|
mteb/cli/build_cli.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
+
import warnings
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
import torch
|
|
@@ -69,15 +70,17 @@ def run(args: argparse.Namespace) -> None:
|
|
|
69
70
|
|
|
70
71
|
overwrite_strategy = args.overwrite_strategy
|
|
71
72
|
if args.overwrite:
|
|
72
|
-
|
|
73
|
-
"`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead."
|
|
73
|
+
warnings.warn(
|
|
74
|
+
"`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead.",
|
|
75
|
+
DeprecationWarning,
|
|
74
76
|
)
|
|
75
77
|
overwrite_strategy = OverwriteStrategy.ALWAYS.value
|
|
76
78
|
|
|
77
79
|
prediction_folder = args.prediction_folder
|
|
78
80
|
if args.save_predictions:
|
|
79
|
-
|
|
80
|
-
"`--save_predictions` is deprecated, please use `--prediction-folder` instead."
|
|
81
|
+
warnings.warn(
|
|
82
|
+
"`--save_predictions` is deprecated, please use `--prediction-folder` instead.",
|
|
83
|
+
DeprecationWarning,
|
|
81
84
|
)
|
|
82
85
|
prediction_folder = args.output_folder
|
|
83
86
|
|
|
@@ -279,7 +282,9 @@ def _create_meta(args: argparse.Namespace) -> None:
|
|
|
279
282
|
from_existing = Path(from_existing)
|
|
280
283
|
|
|
281
284
|
if output_path.exists() and overwrite:
|
|
282
|
-
|
|
285
|
+
msg = "Output path already exists, overwriting."
|
|
286
|
+
logger.warning(msg)
|
|
287
|
+
warnings.warn(msg)
|
|
283
288
|
elif output_path.exists():
|
|
284
289
|
raise FileExistsError(
|
|
285
290
|
"Output path already exists, use --overwrite to overwrite."
|
mteb/cli/generate_model_card.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from huggingface_hub import ModelCard, ModelCardData, repo_exists
|
|
@@ -92,9 +93,9 @@ def generate_model_card(
|
|
|
92
93
|
if repo_exists(existing_model_card_id_or_path):
|
|
93
94
|
existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
|
|
94
95
|
else:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
)
|
|
96
|
+
msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
|
|
97
|
+
logger.warning(msg)
|
|
98
|
+
warnings.warn(msg)
|
|
98
99
|
existing_model_card.save(output_path)
|
|
99
100
|
|
|
100
101
|
|
mteb/deprecated_evaluator.py
CHANGED
|
@@ -5,6 +5,7 @@ import logging
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
7
|
import traceback
|
|
8
|
+
import warnings
|
|
8
9
|
from collections.abc import Iterable
|
|
9
10
|
from copy import deepcopy
|
|
10
11
|
from datetime import datetime
|
|
@@ -470,9 +471,9 @@ class MTEB:
|
|
|
470
471
|
raise ImportError(
|
|
471
472
|
"codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
|
|
472
473
|
)
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
)
|
|
474
|
+
msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
|
|
475
|
+
logger.warning(msg)
|
|
476
|
+
warnings.warn(msg)
|
|
476
477
|
with EmissionsTracker(
|
|
477
478
|
save_to_file=False,
|
|
478
479
|
save_to_api=False,
|
mteb/evaluate.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import warnings
|
|
4
5
|
from collections.abc import Iterable
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from time import time
|
|
@@ -136,10 +137,12 @@ def _evaluate_task(
|
|
|
136
137
|
task.load_data()
|
|
137
138
|
except DatasetNotFoundError as e:
|
|
138
139
|
if not task.metadata.is_public and public_only is None:
|
|
139
|
-
|
|
140
|
+
msg = (
|
|
140
141
|
f"Dataset for private task '{task.metadata.name}' not found. "
|
|
141
142
|
"Make sure you have access to the dataset and that you have set up the authentication correctly. To disable this warning set `public_only=False`"
|
|
142
143
|
)
|
|
144
|
+
logger.warning(msg)
|
|
145
|
+
warnings.warn(msg)
|
|
143
146
|
return TaskError(
|
|
144
147
|
task_name=task.metadata.name,
|
|
145
148
|
exception=str(e),
|
mteb/get_tasks.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import difflib
|
|
4
4
|
import logging
|
|
5
|
+
import warnings
|
|
5
6
|
from collections import Counter, defaultdict
|
|
6
7
|
from collections.abc import Sequence
|
|
7
8
|
from typing import Any
|
|
@@ -340,9 +341,9 @@ def get_task(
|
|
|
340
341
|
"""
|
|
341
342
|
if task_name in _TASK_RENAMES:
|
|
342
343
|
_task_name = _TASK_RENAMES[task_name]
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
)
|
|
344
|
+
msg = f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
|
|
345
|
+
logger.warning(msg)
|
|
346
|
+
warnings.warn(msg)
|
|
346
347
|
|
|
347
348
|
if task_name not in _TASKS_REGISTRY:
|
|
348
349
|
close_matches = difflib.get_close_matches(task_name, _TASKS_REGISTRY.keys())
|
mteb/leaderboard/app.py
CHANGED
|
@@ -36,9 +36,15 @@ LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.lang
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
def _load_results(cache: ResultCache) -> BenchmarkResults:
|
|
39
|
+
start_time = time.time()
|
|
39
40
|
results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
|
|
40
41
|
if not results_cache_path.exists():
|
|
42
|
+
logger.info("Cached results not found, downloading from remote...")
|
|
41
43
|
cache.download_from_remote()
|
|
44
|
+
download_time = time.time() - start_time
|
|
45
|
+
logger.info(f"Downloaded remote results in {download_time:.2f}s")
|
|
46
|
+
|
|
47
|
+
load_start = time.time()
|
|
42
48
|
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
43
49
|
|
|
44
50
|
all_results = cache.load_results(
|
|
@@ -47,10 +53,16 @@ def _load_results(cache: ResultCache) -> BenchmarkResults:
|
|
|
47
53
|
require_model_meta=False,
|
|
48
54
|
include_remote=True,
|
|
49
55
|
)
|
|
56
|
+
load_time = time.time() - load_start
|
|
57
|
+
logger.info(f"Loaded results from cache in {load_time:.2f}s")
|
|
50
58
|
return all_results
|
|
51
59
|
else:
|
|
60
|
+
logger.info("Loading cached results from disk...")
|
|
52
61
|
with results_cache_path.open() as cache_file:
|
|
53
|
-
|
|
62
|
+
results = mteb.BenchmarkResults.from_validated(**json.load(cache_file))
|
|
63
|
+
total_time = time.time() - start_time
|
|
64
|
+
logger.info(f"Loaded cached results in {total_time:.2f}s")
|
|
65
|
+
return results
|
|
54
66
|
|
|
55
67
|
|
|
56
68
|
def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
|
|
@@ -322,20 +334,48 @@ def _cache_update_task_list(
|
|
|
322
334
|
|
|
323
335
|
def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
324
336
|
"""Returns a Gradio Blocks app for the MTEB leaderboard."""
|
|
325
|
-
|
|
337
|
+
app_start = time.time()
|
|
338
|
+
logger.info("=== Starting leaderboard app initialization ===")
|
|
339
|
+
|
|
340
|
+
logger.info("Step 1/7: Loading all benchmark results...")
|
|
341
|
+
load_start = time.time()
|
|
326
342
|
all_results = _load_results(cache)
|
|
343
|
+
load_time = time.time() - load_start
|
|
344
|
+
logger.info(f"Step 1/7 complete: Loaded results in {load_time:.2f}s")
|
|
327
345
|
|
|
346
|
+
logger.info("Step 2/7: Fetching benchmarks...")
|
|
347
|
+
bench_start = time.time()
|
|
328
348
|
benchmarks = sorted(
|
|
329
349
|
mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name
|
|
330
350
|
)
|
|
351
|
+
bench_time = time.time() - bench_start
|
|
352
|
+
logger.info(
|
|
353
|
+
f"Step 2/7 complete: Fetched {len(benchmarks)} benchmarks in {bench_time:.2f}s"
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
logger.info(
|
|
357
|
+
"Step 3/7: Processing all benchmarks (select_tasks + join_revisions)..."
|
|
358
|
+
)
|
|
359
|
+
process_start = time.time()
|
|
331
360
|
all_benchmark_results = {
|
|
332
361
|
benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
|
|
333
362
|
for benchmark in benchmarks
|
|
334
363
|
}
|
|
364
|
+
process_time = time.time() - process_start
|
|
365
|
+
if len(benchmarks) > 0:
|
|
366
|
+
logger.info(
|
|
367
|
+
f"Step 3/7 complete: Processed {len(benchmarks)} benchmarks in {process_time:.2f}s (avg {process_time / len(benchmarks):.2f}s/benchmark)"
|
|
368
|
+
)
|
|
369
|
+
else:
|
|
370
|
+
logger.info(
|
|
371
|
+
f"Step 3/7 complete: Processed 0 benchmarks in {process_time:.2f}s (avg N/A)"
|
|
372
|
+
)
|
|
335
373
|
|
|
336
374
|
default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
|
|
337
375
|
default_results = all_benchmark_results[default_benchmark.name]
|
|
338
|
-
|
|
376
|
+
|
|
377
|
+
logger.info("Step 4/7: Filtering models...")
|
|
378
|
+
filter_start = time.time()
|
|
339
379
|
|
|
340
380
|
default_scores = default_results._get_scores(format="long")
|
|
341
381
|
all_models = list({entry["model_name"] for entry in default_scores})
|
|
@@ -355,7 +395,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
355
395
|
# Filter BenchmarkResults based on default filtered models (as required by Kenneth)
|
|
356
396
|
filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
|
|
357
397
|
filtered_benchmark_results = default_results.select_models(filtered_model_names)
|
|
398
|
+
filter_time = time.time() - filter_start
|
|
399
|
+
logger.info(
|
|
400
|
+
f"Step 4/7 complete: Filtered {len(filtered_model_names)} models in {filter_time:.2f}s"
|
|
401
|
+
)
|
|
358
402
|
|
|
403
|
+
logger.info("Step 5/7: Generating tables...")
|
|
404
|
+
table_start = time.time()
|
|
359
405
|
summary_table = apply_summary_styling_from_benchmark(
|
|
360
406
|
default_benchmark, filtered_benchmark_results
|
|
361
407
|
)
|
|
@@ -366,10 +412,14 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
366
412
|
default_benchmark,
|
|
367
413
|
filtered_benchmark_results,
|
|
368
414
|
)
|
|
415
|
+
table_time = time.time() - table_start
|
|
416
|
+
logger.info(f"Step 5/7 complete: Generated tables in {table_time:.2f}s")
|
|
369
417
|
|
|
370
418
|
# Check if this benchmark displays per-language results
|
|
371
419
|
display_language_table = len(default_benchmark.language_view) > 0
|
|
372
420
|
|
|
421
|
+
logger.info("Step 6/7: Creating Gradio components...")
|
|
422
|
+
component_start = time.time()
|
|
373
423
|
lang_select = gr.CheckboxGroup(
|
|
374
424
|
sorted(default_results.languages),
|
|
375
425
|
value=sorted(default_results.languages),
|
|
@@ -410,7 +460,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
410
460
|
label="Modality",
|
|
411
461
|
info="Select modalities to include.",
|
|
412
462
|
)
|
|
463
|
+
component_time = time.time() - component_start
|
|
464
|
+
logger.info(
|
|
465
|
+
f"Step 6/7 complete: Created Gradio components in {component_time:.2f}s"
|
|
466
|
+
)
|
|
413
467
|
|
|
468
|
+
logger.info("Step 7/7: Building Gradio interface and callbacks...")
|
|
469
|
+
interface_start = time.time()
|
|
414
470
|
with gr.Blocks(fill_width=True) as demo:
|
|
415
471
|
with gr.Sidebar(
|
|
416
472
|
position="left",
|
|
@@ -926,7 +982,11 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
926
982
|
)
|
|
927
983
|
|
|
928
984
|
gr.Markdown(ACKNOWLEDGEMENT, elem_id="ack_markdown")
|
|
985
|
+
interface_time = time.time() - interface_start
|
|
986
|
+
logger.info(f"Step 7/7 complete: Built Gradio interface in {interface_time:.2f}s")
|
|
929
987
|
|
|
988
|
+
logger.info("Starting prerun on all benchmarks to populate caches...")
|
|
989
|
+
prerun_start = time.time()
|
|
930
990
|
# Prerun on all benchmarks, so that results of callbacks get cached
|
|
931
991
|
for benchmark in benchmarks:
|
|
932
992
|
(
|
|
@@ -952,6 +1012,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
952
1012
|
update_tables(
|
|
953
1013
|
bench_scores, filtered_tasks, bench_initial_models, benchmark.name
|
|
954
1014
|
)
|
|
1015
|
+
prerun_time = time.time() - prerun_start
|
|
1016
|
+
logger.info(
|
|
1017
|
+
f"Prerun complete: Processed {len(benchmarks)} benchmarks in {prerun_time:.2f}s"
|
|
1018
|
+
)
|
|
1019
|
+
|
|
1020
|
+
total_time = time.time() - app_start
|
|
1021
|
+
logger.info(f"=== Leaderboard app initialization complete in {total_time:.2f}s ===")
|
|
955
1022
|
return demo
|
|
956
1023
|
|
|
957
1024
|
|
mteb/models/abs_encoder.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from abc import ABC, abstractmethod
|
|
3
4
|
from collections.abc import Callable, Sequence
|
|
4
5
|
from typing import Any, Literal, cast, get_args, overload
|
|
@@ -187,6 +188,7 @@ class AbsEncoder(ABC):
|
|
|
187
188
|
except KeyError:
|
|
188
189
|
msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
|
|
189
190
|
logger.warning(msg)
|
|
191
|
+
warnings.warn(msg)
|
|
190
192
|
invalid_task_messages.add(msg)
|
|
191
193
|
invalid_keys.add(task_key)
|
|
192
194
|
|
|
@@ -232,9 +234,9 @@ class AbsEncoder(ABC):
|
|
|
232
234
|
if isinstance(prompt, dict) and prompt_type:
|
|
233
235
|
if prompt.get(prompt_type.value):
|
|
234
236
|
return prompt[prompt_type.value]
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
237
|
+
msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
|
|
238
|
+
logger.warning(msg)
|
|
239
|
+
warnings.warn(msg)
|
|
238
240
|
return ""
|
|
239
241
|
|
|
240
242
|
if prompt:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
@@ -71,7 +72,9 @@ class FaissCache:
|
|
|
71
72
|
try:
|
|
72
73
|
return self.index.reconstruct(idx)
|
|
73
74
|
except Exception:
|
|
74
|
-
|
|
75
|
+
msg = f"Vector id {idx} missing for hash {item_hash}"
|
|
76
|
+
logger.warning(msg)
|
|
77
|
+
warnings.warn(msg)
|
|
75
78
|
return None
|
|
76
79
|
|
|
77
80
|
def save(self) -> None:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
@@ -41,9 +42,9 @@ class NumpyCache:
|
|
|
41
42
|
for item, vec in zip(item, vectors):
|
|
42
43
|
item_hash = _hash_item(item)
|
|
43
44
|
if item_hash in self.hash_to_index:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
)
|
|
45
|
+
msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector."
|
|
46
|
+
logger.warning(msg)
|
|
47
|
+
warnings.warn(msg)
|
|
47
48
|
index = self.hash_to_index[item_hash]
|
|
48
49
|
else:
|
|
49
50
|
index = len(self.hash_to_index)
|
|
@@ -107,9 +108,9 @@ class NumpyCache:
|
|
|
107
108
|
f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
|
|
108
109
|
)
|
|
109
110
|
else:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
)
|
|
111
|
+
msg = "Dimension file not found. Vector dimension remains uninitialized."
|
|
112
|
+
logger.warning(msg)
|
|
113
|
+
warnings.warn(msg)
|
|
113
114
|
|
|
114
115
|
def save(self) -> None:
|
|
115
116
|
"""Persist VectorCacheMap to disk."""
|
|
@@ -151,14 +152,14 @@ class NumpyCache:
|
|
|
151
152
|
self.vectors = self.vectors.reshape(-1, self.vector_dim)
|
|
152
153
|
logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
|
|
153
154
|
else:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
)
|
|
155
|
+
msg = "Vector dimension not set. Unable to load vectors file."
|
|
156
|
+
logger.warning(msg)
|
|
157
|
+
warnings.warn(msg)
|
|
157
158
|
logger.info(f"Loaded VectorCacheMap from {self.directory}")
|
|
158
159
|
else:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
160
|
+
msg = "No existing files found. Initialized empty VectorCacheMap."
|
|
161
|
+
logger.warning(msg)
|
|
162
|
+
warnings.warn(msg)
|
|
162
163
|
except Exception as e:
|
|
163
164
|
logger.error(f"Error loading VectorCacheMap: {str(e)}")
|
|
164
165
|
raise
|
|
@@ -105,6 +105,7 @@ class ALIGNModel(AbsEncoder):
|
|
|
105
105
|
align_base = ModelMeta(
|
|
106
106
|
loader=ALIGNModel,
|
|
107
107
|
name="kakaobrain/align-base",
|
|
108
|
+
model_type=["dense"],
|
|
108
109
|
languages=["eng-Latn"],
|
|
109
110
|
revision="e96a37facc7b1f59090ece82293226b817afd6ba",
|
|
110
111
|
release_date="2023-02-24",
|
|
@@ -6,6 +6,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
|
6
6
|
model2vecdk = ModelMeta(
|
|
7
7
|
loader=Model2VecModel, # type: ignore
|
|
8
8
|
name="andersborges/model2vecdk",
|
|
9
|
+
model_type=["dense"],
|
|
9
10
|
languages=["dan-Latn"],
|
|
10
11
|
open_weights=True,
|
|
11
12
|
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
@@ -36,6 +37,7 @@ model2vecdk = ModelMeta(
|
|
|
36
37
|
model2vecdk_stem = ModelMeta(
|
|
37
38
|
loader=Model2VecModel, # type: ignore
|
|
38
39
|
name="andersborges/model2vecdk-stem",
|
|
40
|
+
model_type=["dense"],
|
|
39
41
|
languages=["dan-Latn"],
|
|
40
42
|
open_weights=True,
|
|
41
43
|
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
@@ -4,6 +4,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
|
|
|
4
4
|
arabic_triplet_matryoshka = ModelMeta(
|
|
5
5
|
loader=sentence_transformers_loader,
|
|
6
6
|
name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
|
|
7
|
+
model_type=["dense"],
|
|
7
8
|
languages=["ara-Arab"],
|
|
8
9
|
open_weights=True,
|
|
9
10
|
revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200",
|
|
@@ -140,6 +140,7 @@ arctic_v2_training_datasets = {
|
|
|
140
140
|
arctic_embed_xs = ModelMeta(
|
|
141
141
|
loader=sentence_transformers_loader,
|
|
142
142
|
name="Snowflake/snowflake-arctic-embed-xs",
|
|
143
|
+
model_type=["dense"],
|
|
143
144
|
revision="742da4f66e1823b5b4dbe6c320a1375a1fd85f9e",
|
|
144
145
|
release_date="2024-07-08", # initial commit of hf model.
|
|
145
146
|
languages=["eng-Latn"],
|
|
@@ -165,6 +166,7 @@ arctic_embed_xs = ModelMeta(
|
|
|
165
166
|
arctic_embed_s = ModelMeta(
|
|
166
167
|
loader=sentence_transformers_loader,
|
|
167
168
|
name="Snowflake/snowflake-arctic-embed-s",
|
|
169
|
+
model_type=["dense"],
|
|
168
170
|
revision="d3c1d2d433dd0fdc8e9ca01331a5f225639e798f",
|
|
169
171
|
release_date="2024-04-12", # initial commit of hf model.
|
|
170
172
|
languages=["eng-Latn"],
|
|
@@ -190,6 +192,7 @@ arctic_embed_s = ModelMeta(
|
|
|
190
192
|
arctic_embed_m = ModelMeta(
|
|
191
193
|
loader=sentence_transformers_loader,
|
|
192
194
|
name="Snowflake/snowflake-arctic-embed-m",
|
|
195
|
+
model_type=["dense"],
|
|
193
196
|
revision="cc17beacbac32366782584c8752220405a0f3f40",
|
|
194
197
|
release_date="2024-04-12", # initial commit of hf model.
|
|
195
198
|
languages=["eng-Latn"],
|
|
@@ -215,6 +218,7 @@ arctic_embed_m_long = ModelMeta(
|
|
|
215
218
|
loader=sentence_transformers_loader,
|
|
216
219
|
loader_kwargs={"trust_remote_code": True},
|
|
217
220
|
name="Snowflake/snowflake-arctic-embed-m-long",
|
|
221
|
+
model_type=["dense"],
|
|
218
222
|
revision="89d0f6ab196eead40b90cb6f9fefec01a908d2d1",
|
|
219
223
|
release_date="2024-04-12", # initial commit of hf model.
|
|
220
224
|
languages=["eng-Latn"],
|
|
@@ -239,6 +243,7 @@ arctic_embed_m_long = ModelMeta(
|
|
|
239
243
|
arctic_embed_l = ModelMeta(
|
|
240
244
|
loader=sentence_transformers_loader,
|
|
241
245
|
name="Snowflake/snowflake-arctic-embed-l",
|
|
246
|
+
model_type=["dense"],
|
|
242
247
|
revision="9a9e5834d2e89cdd8bb72b64111dde496e4fe78c",
|
|
243
248
|
release_date="2024-04-12", # initial commit of hf model.
|
|
244
249
|
languages=["eng-Latn"],
|
|
@@ -268,6 +273,7 @@ arctic_embed_m_v1_5 = ModelMeta(
|
|
|
268
273
|
},
|
|
269
274
|
),
|
|
270
275
|
name="Snowflake/snowflake-arctic-embed-m-v1.5",
|
|
276
|
+
model_type=["dense"],
|
|
271
277
|
revision="97eab2e17fcb7ccb8bb94d6e547898fa1a6a0f47",
|
|
272
278
|
release_date="2024-07-08", # initial commit of hf model.
|
|
273
279
|
languages=["eng-Latn"],
|
|
@@ -293,6 +299,7 @@ arctic_embed_m_v2_0 = ModelMeta(
|
|
|
293
299
|
loader=sentence_transformers_loader,
|
|
294
300
|
loader_kwargs={"trust_remote_code": True},
|
|
295
301
|
name="Snowflake/snowflake-arctic-embed-m-v2.0",
|
|
302
|
+
model_type=["dense"],
|
|
296
303
|
revision="f2a7d59d80dfda5b1d14f096f3ce88bb6bf9ebdc",
|
|
297
304
|
release_date="2024-12-04", # initial commit of hf model.
|
|
298
305
|
languages=LANGUAGES_V2_0,
|
|
@@ -317,6 +324,7 @@ arctic_embed_m_v2_0 = ModelMeta(
|
|
|
317
324
|
arctic_embed_l_v2_0 = ModelMeta(
|
|
318
325
|
loader=sentence_transformers_loader,
|
|
319
326
|
name="Snowflake/snowflake-arctic-embed-l-v2.0",
|
|
327
|
+
model_type=["dense"],
|
|
320
328
|
revision="edc2df7b6c25794b340229ca082e7c78782e6374",
|
|
321
329
|
release_date="2024-12-04", # initial commit of hf model.
|
|
322
330
|
languages=LANGUAGES_V2_0,
|
|
@@ -155,6 +155,7 @@ class BedrockModel(AbsEncoder):
|
|
|
155
155
|
|
|
156
156
|
amazon_titan_embed_text_v1 = ModelMeta(
|
|
157
157
|
name="bedrock/amazon-titan-embed-text-v1",
|
|
158
|
+
model_type=["dense"],
|
|
158
159
|
revision="1",
|
|
159
160
|
release_date="2023-09-27",
|
|
160
161
|
languages=None, # not specified
|
|
@@ -181,6 +182,7 @@ amazon_titan_embed_text_v1 = ModelMeta(
|
|
|
181
182
|
|
|
182
183
|
amazon_titan_embed_text_v2 = ModelMeta(
|
|
183
184
|
name="bedrock/amazon-titan-embed-text-v2",
|
|
185
|
+
model_type=["dense"],
|
|
184
186
|
revision="1",
|
|
185
187
|
release_date="2024-04-30",
|
|
186
188
|
languages=None, # not specified
|
|
@@ -216,6 +218,7 @@ cohere_embed_english_v3 = ModelMeta(
|
|
|
216
218
|
model_prompts=cohere_model_prompts,
|
|
217
219
|
),
|
|
218
220
|
name="bedrock/cohere-embed-english-v3",
|
|
221
|
+
model_type=["dense"],
|
|
219
222
|
languages=["eng-Latn"],
|
|
220
223
|
open_weights=False,
|
|
221
224
|
reference="https://cohere.com/blog/introducing-embed-v3",
|
|
@@ -243,6 +246,7 @@ cohere_embed_multilingual_v3 = ModelMeta(
|
|
|
243
246
|
model_prompts=cohere_model_prompts,
|
|
244
247
|
),
|
|
245
248
|
name="bedrock/cohere-embed-multilingual-v3",
|
|
249
|
+
model_type=["dense"],
|
|
246
250
|
languages=cohere_supported_languages,
|
|
247
251
|
open_weights=False,
|
|
248
252
|
reference="https://cohere.com/blog/introducing-embed-v3",
|