mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/leaderboard/app.py
CHANGED
|
@@ -5,7 +5,7 @@ import tempfile
|
|
|
5
5
|
import time
|
|
6
6
|
import warnings
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Literal
|
|
8
|
+
from typing import Literal, get_args
|
|
9
9
|
from urllib.parse import urlencode
|
|
10
10
|
|
|
11
11
|
import cachetools
|
|
@@ -29,40 +29,115 @@ from mteb.leaderboard.table import (
|
|
|
29
29
|
apply_summary_styling_from_benchmark,
|
|
30
30
|
)
|
|
31
31
|
from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
|
|
32
|
+
from mteb.models.model_meta import MODEL_TYPES
|
|
32
33
|
|
|
33
34
|
logger = logging.getLogger(__name__)
|
|
34
35
|
|
|
36
|
+
|
|
35
37
|
LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
|
|
38
|
+
MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
def _load_results(cache: ResultCache) -> BenchmarkResults:
|
|
42
|
+
"""Load benchmark results using an optimized caching strategy.
|
|
43
|
+
|
|
44
|
+
This function implements a two-tier caching strategy for faster leaderboard startup:
|
|
45
|
+
|
|
46
|
+
1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
|
|
47
|
+
'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
|
|
48
|
+
This avoids the need to clone the entire results repository and provides
|
|
49
|
+
near-instantaneous loading for most users.
|
|
50
|
+
|
|
51
|
+
2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
|
|
52
|
+
the original approach of downloading the full results repository and
|
|
53
|
+
building the cache from scratch.
|
|
54
|
+
|
|
55
|
+
The cached results file contains pre-aggregated benchmark data that eliminates
|
|
56
|
+
the need for expensive operations like task selection and revision joining
|
|
57
|
+
during app startup.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
cache: ResultCache instance used for both optimized and fallback operations
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
BenchmarkResults: Complete benchmark results ready for leaderboard display
|
|
64
|
+
|
|
65
|
+
Raises:
|
|
66
|
+
Various exceptions related to network issues, file I/O, or data validation
|
|
67
|
+
are logged and may cause fallback to the slower repository-based approach.
|
|
68
|
+
"""
|
|
39
69
|
start_time = time.time()
|
|
40
70
|
results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
|
|
71
|
+
|
|
41
72
|
if not results_cache_path.exists():
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
load_start = time.time()
|
|
48
|
-
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
49
|
-
|
|
50
|
-
all_results = cache.load_results(
|
|
51
|
-
models=all_model_names,
|
|
52
|
-
only_main_score=True,
|
|
53
|
-
require_model_meta=False,
|
|
54
|
-
include_remote=True,
|
|
73
|
+
# First try to download the cached results file from the cached-data branch
|
|
74
|
+
# This is faster than cloning the entire results repository
|
|
75
|
+
logger.info(
|
|
76
|
+
"Cached results not found, trying to download from cached-data branch..."
|
|
55
77
|
)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
# Use ResultCache's optimized download method
|
|
81
|
+
# Default saves to mteb/leaderboard/__cached_results.json
|
|
82
|
+
results_cache_path = cache._download_cached_results_from_branch()
|
|
83
|
+
download_time = time.time() - start_time
|
|
84
|
+
logger.info(
|
|
85
|
+
f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
except Exception as e:
|
|
89
|
+
logger.error(
|
|
90
|
+
f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
|
|
91
|
+
)
|
|
92
|
+
logger.info("Falling back to downloading full remote repository...")
|
|
93
|
+
|
|
94
|
+
# Fall back to the original approach: clone the full repo
|
|
95
|
+
cache.download_from_remote()
|
|
96
|
+
download_time = time.time() - start_time
|
|
97
|
+
logger.info(f"Downloaded remote results in {download_time:.2f}s")
|
|
98
|
+
|
|
99
|
+
load_start = time.time()
|
|
100
|
+
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
101
|
+
|
|
102
|
+
all_results = cache.load_results(
|
|
103
|
+
models=all_model_names,
|
|
104
|
+
only_main_score=True,
|
|
105
|
+
require_model_meta=False,
|
|
106
|
+
include_remote=True,
|
|
107
|
+
)
|
|
108
|
+
load_time = time.time() - load_start
|
|
109
|
+
logger.info(f"Loaded results from cache in {load_time:.2f}s")
|
|
110
|
+
return all_results
|
|
111
|
+
|
|
112
|
+
# Load the cached results file (either pre-existing or just downloaded)
|
|
113
|
+
logger.info("Loading cached results from disk...")
|
|
114
|
+
try:
|
|
115
|
+
logger.info(f"Opening file: {results_cache_path}")
|
|
116
|
+
|
|
117
|
+
file_size = results_cache_path.stat().st_size
|
|
118
|
+
logger.info(f"File exists, size: {file_size} bytes")
|
|
119
|
+
|
|
61
120
|
with results_cache_path.open() as cache_file:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
121
|
+
logger.info("File opened successfully, attempting JSON parse...")
|
|
122
|
+
json_data = json.load(cache_file)
|
|
123
|
+
logger.info(
|
|
124
|
+
f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
logger.info("Attempting BenchmarkResults.from_validated...")
|
|
128
|
+
results = mteb.BenchmarkResults.from_validated(**json_data)
|
|
129
|
+
logger.info("BenchmarkResults.from_validated successful")
|
|
130
|
+
|
|
131
|
+
except Exception as e:
|
|
132
|
+
# TODO: Handle the case when we fail to load cached results from disk.
|
|
133
|
+
logger.error(
|
|
134
|
+
f"Failed to load cached results from disk: {type(e).__name__}: {e}"
|
|
135
|
+
)
|
|
136
|
+
raise
|
|
137
|
+
|
|
138
|
+
total_time = time.time() - start_time
|
|
139
|
+
logger.info(f"Loaded cached results in {total_time:.2f}s")
|
|
140
|
+
return results
|
|
66
141
|
|
|
67
142
|
|
|
68
143
|
def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
|
|
@@ -169,7 +244,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
|
|
|
169
244
|
df = df.drop(columns="reference")
|
|
170
245
|
return gr.DataFrame(
|
|
171
246
|
df,
|
|
172
|
-
datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
|
|
247
|
+
datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
|
|
173
248
|
buttons=["copy", "fullscreen"],
|
|
174
249
|
show_search="filter",
|
|
175
250
|
)
|
|
@@ -187,6 +262,7 @@ def _filter_models(
|
|
|
187
262
|
instructions: bool | None,
|
|
188
263
|
max_model_size: int,
|
|
189
264
|
zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
|
|
265
|
+
model_types: list[str] | None,
|
|
190
266
|
):
|
|
191
267
|
lower, upper = 0, max_model_size
|
|
192
268
|
# Setting to None, when the user doesn't specify anything
|
|
@@ -205,6 +281,7 @@ def _filter_models(
|
|
|
205
281
|
use_instructions=instructions,
|
|
206
282
|
frameworks=compatibility,
|
|
207
283
|
n_parameters_range=(lower, upper),
|
|
284
|
+
model_types=model_types,
|
|
208
285
|
)
|
|
209
286
|
|
|
210
287
|
models_to_keep = set()
|
|
@@ -269,6 +346,7 @@ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
|
|
|
269
346
|
instructions=None,
|
|
270
347
|
max_model_size=MAX_MODEL_SIZE,
|
|
271
348
|
zero_shot_setting="allow_all",
|
|
349
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
272
350
|
)
|
|
273
351
|
# Sort to ensure consistency with update_models
|
|
274
352
|
initial_models = sorted(initial_models)
|
|
@@ -387,6 +465,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
387
465
|
instructions=None,
|
|
388
466
|
max_model_size=MAX_MODEL_SIZE,
|
|
389
467
|
zero_shot_setting="allow_all",
|
|
468
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
390
469
|
)
|
|
391
470
|
default_filtered_scores = [
|
|
392
471
|
entry for entry in default_scores if entry["model_name"] in filtered_models
|
|
@@ -583,6 +662,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
583
662
|
label="Model Parameters",
|
|
584
663
|
interactive=True,
|
|
585
664
|
)
|
|
665
|
+
with gr.Column():
|
|
666
|
+
model_type_select = gr.CheckboxGroup(
|
|
667
|
+
MODEL_TYPE_CHOICES,
|
|
668
|
+
value=MODEL_TYPE_CHOICES,
|
|
669
|
+
label="Model Type",
|
|
670
|
+
)
|
|
586
671
|
|
|
587
672
|
with gr.Tab("Summary"):
|
|
588
673
|
summary_table.render()
|
|
@@ -755,7 +840,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
755
840
|
compatibility,
|
|
756
841
|
instructions,
|
|
757
842
|
max_model_size,
|
|
758
|
-
zero_shot
|
|
843
|
+
zero_shot,
|
|
844
|
+
model_type_select: hash(
|
|
759
845
|
(
|
|
760
846
|
id(scores),
|
|
761
847
|
hash(tuple(tasks)),
|
|
@@ -764,6 +850,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
764
850
|
hash(instructions),
|
|
765
851
|
hash(max_model_size),
|
|
766
852
|
hash(zero_shot),
|
|
853
|
+
hash(tuple(model_type_select)),
|
|
767
854
|
)
|
|
768
855
|
),
|
|
769
856
|
)
|
|
@@ -775,6 +862,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
775
862
|
instructions: bool | None,
|
|
776
863
|
max_model_size: int,
|
|
777
864
|
zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
|
|
865
|
+
model_type_select: list[str],
|
|
778
866
|
):
|
|
779
867
|
start_time = time.time()
|
|
780
868
|
model_names = list({entry["model_name"] for entry in scores})
|
|
@@ -786,6 +874,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
786
874
|
instructions,
|
|
787
875
|
max_model_size,
|
|
788
876
|
zero_shot_setting=zero_shot,
|
|
877
|
+
model_types=model_type_select,
|
|
789
878
|
)
|
|
790
879
|
elapsed = time.time() - start_time
|
|
791
880
|
logger.debug(f"update_models callback: {elapsed}s")
|
|
@@ -803,6 +892,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
803
892
|
instructions,
|
|
804
893
|
max_model_size,
|
|
805
894
|
zero_shot,
|
|
895
|
+
model_type_select,
|
|
806
896
|
],
|
|
807
897
|
outputs=[models],
|
|
808
898
|
)
|
|
@@ -817,6 +907,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
817
907
|
instructions,
|
|
818
908
|
max_model_size,
|
|
819
909
|
zero_shot,
|
|
910
|
+
model_type_select,
|
|
820
911
|
],
|
|
821
912
|
outputs=[models],
|
|
822
913
|
)
|
|
@@ -830,6 +921,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
830
921
|
instructions,
|
|
831
922
|
max_model_size,
|
|
832
923
|
zero_shot,
|
|
924
|
+
model_type_select,
|
|
833
925
|
],
|
|
834
926
|
outputs=[models],
|
|
835
927
|
)
|
|
@@ -843,6 +935,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
843
935
|
instructions,
|
|
844
936
|
max_model_size,
|
|
845
937
|
zero_shot,
|
|
938
|
+
model_type_select,
|
|
846
939
|
],
|
|
847
940
|
outputs=[models],
|
|
848
941
|
)
|
|
@@ -856,6 +949,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
856
949
|
instructions,
|
|
857
950
|
max_model_size,
|
|
858
951
|
zero_shot,
|
|
952
|
+
model_type_select,
|
|
859
953
|
],
|
|
860
954
|
outputs=[models],
|
|
861
955
|
)
|
|
@@ -869,6 +963,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
869
963
|
instructions,
|
|
870
964
|
max_model_size,
|
|
871
965
|
zero_shot,
|
|
966
|
+
model_type_select,
|
|
872
967
|
],
|
|
873
968
|
outputs=[models],
|
|
874
969
|
)
|
|
@@ -882,6 +977,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
882
977
|
instructions,
|
|
883
978
|
max_model_size,
|
|
884
979
|
zero_shot,
|
|
980
|
+
model_type_select,
|
|
981
|
+
],
|
|
982
|
+
outputs=[models],
|
|
983
|
+
)
|
|
984
|
+
model_type_select.change(
|
|
985
|
+
update_models,
|
|
986
|
+
inputs=[
|
|
987
|
+
scores,
|
|
988
|
+
task_select,
|
|
989
|
+
availability,
|
|
990
|
+
compatibility,
|
|
991
|
+
instructions,
|
|
992
|
+
max_model_size,
|
|
993
|
+
zero_shot,
|
|
994
|
+
model_type_select,
|
|
885
995
|
],
|
|
886
996
|
outputs=[models],
|
|
887
997
|
)
|
|
@@ -1023,16 +1133,34 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
1023
1133
|
|
|
1024
1134
|
|
|
1025
1135
|
if __name__ == "__main__":
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
logging.
|
|
1030
|
-
logging.
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1136
|
+
import os
|
|
1137
|
+
|
|
1138
|
+
# Add process ID to logging for multiprocessing debugging
|
|
1139
|
+
logging.basicConfig(
|
|
1140
|
+
level=logging.INFO,
|
|
1141
|
+
format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
|
|
1142
|
+
force=True, # Override any existing handlers
|
|
1143
|
+
)
|
|
1144
|
+
|
|
1145
|
+
# Flush log handlers immediately (helpful for multiprocessing)
|
|
1146
|
+
for handler in logging.root.handlers:
|
|
1147
|
+
handler.flush()
|
|
1148
|
+
|
|
1149
|
+
logger.info(f"Starting leaderboard app in process {os.getpid()}")
|
|
1150
|
+
|
|
1151
|
+
# Suppress specific WARNING messages while keeping INFO level for the app
|
|
1152
|
+
logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
|
|
1153
|
+
logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
|
|
1154
|
+
logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
|
|
1155
|
+
|
|
1035
1156
|
warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
|
|
1157
|
+
warnings.filterwarnings("ignore", message="Could not get source model: .*")
|
|
1158
|
+
warnings.filterwarnings(
|
|
1159
|
+
"ignore", message="No scores data available. Returning empty DataFrame."
|
|
1160
|
+
)
|
|
1161
|
+
warnings.filterwarnings("ignore", message="Main score .* not found in scores")
|
|
1162
|
+
warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
|
|
1163
|
+
warnings.filterwarnings("ignore", message=".*: Missing splits .*")
|
|
1036
1164
|
|
|
1037
1165
|
app = get_leaderboard_app()
|
|
1038
1166
|
|
mteb/load_results.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import sys
|
|
4
|
-
from collections.abc import Sequence
|
|
4
|
+
from collections.abc import Iterable, Sequence
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
|
|
7
7
|
from mteb.abstasks.abstask import AbsTask
|
|
@@ -45,8 +45,8 @@ def _model_name_and_revision(
|
|
|
45
45
|
def load_results(
|
|
46
46
|
results_repo: str = "https://github.com/embeddings-benchmark/results",
|
|
47
47
|
download_latest: bool = True,
|
|
48
|
-
models:
|
|
49
|
-
tasks:
|
|
48
|
+
models: Iterable[ModelMeta] | Sequence[str] | None = None,
|
|
49
|
+
tasks: Iterable[AbsTask] | Sequence[str] | None = None,
|
|
50
50
|
validate_and_filter: bool = True,
|
|
51
51
|
require_model_meta: bool = True,
|
|
52
52
|
only_main_score: bool = False,
|
|
@@ -83,21 +83,21 @@ def load_results(
|
|
|
83
83
|
|
|
84
84
|
if models is not None:
|
|
85
85
|
models_to_keep = {}
|
|
86
|
-
for
|
|
87
|
-
if isinstance(
|
|
88
|
-
models_to_keep[
|
|
86
|
+
for model in models:
|
|
87
|
+
if isinstance(model, ModelMeta):
|
|
88
|
+
models_to_keep[model.name] = model.revision
|
|
89
89
|
else:
|
|
90
|
-
models_to_keep[
|
|
90
|
+
models_to_keep[model] = None
|
|
91
91
|
else:
|
|
92
92
|
models_to_keep = None
|
|
93
93
|
|
|
94
|
-
task_names = {}
|
|
94
|
+
task_names: dict[str, AbsTask | None] = {}
|
|
95
95
|
if tasks is not None:
|
|
96
|
-
for
|
|
97
|
-
if isinstance(
|
|
98
|
-
task_names[
|
|
96
|
+
for task_ in tasks:
|
|
97
|
+
if isinstance(task_, AbsTask):
|
|
98
|
+
task_names[task_.metadata.name] = task_
|
|
99
99
|
else:
|
|
100
|
-
task_names[
|
|
100
|
+
task_names[task_] = None
|
|
101
101
|
|
|
102
102
|
model_results = []
|
|
103
103
|
for model_path in model_paths:
|
mteb/models/abs_encoder.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
2
3
|
from abc import ABC, abstractmethod
|
|
3
4
|
from collections.abc import Callable, Sequence
|
|
4
5
|
from typing import Any, Literal, cast, get_args, overload
|
|
5
6
|
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
8
|
+
from typing_extensions import Unpack
|
|
7
9
|
|
|
8
10
|
import mteb
|
|
9
11
|
from mteb.abstasks.task_metadata import TaskMetadata, TaskType
|
|
@@ -18,6 +20,7 @@ from mteb.similarity_functions import (
|
|
|
18
20
|
from mteb.types import (
|
|
19
21
|
Array,
|
|
20
22
|
BatchedInput,
|
|
23
|
+
EncodeKwargs,
|
|
21
24
|
PromptType,
|
|
22
25
|
)
|
|
23
26
|
|
|
@@ -43,7 +46,7 @@ class AbsEncoder(ABC):
|
|
|
43
46
|
model: Any
|
|
44
47
|
mteb_model_meta: ModelMeta | None = None
|
|
45
48
|
model_prompts: dict[str, str] | None = None
|
|
46
|
-
instruction_template: str | Callable[[str, PromptType], str] | None = None
|
|
49
|
+
instruction_template: str | Callable[[str, PromptType | None], str] | None = None
|
|
47
50
|
prompts_dict: dict[str, str] | None = None
|
|
48
51
|
|
|
49
52
|
def get_prompt_name(
|
|
@@ -110,7 +113,7 @@ class AbsEncoder(ABC):
|
|
|
110
113
|
if not self.model_prompts:
|
|
111
114
|
return None
|
|
112
115
|
prompt_name = self.get_prompt_name(task_metadata, prompt_type)
|
|
113
|
-
return self.model_prompts.get(prompt_name)
|
|
116
|
+
return self.model_prompts.get(prompt_name) if prompt_name else None
|
|
114
117
|
|
|
115
118
|
@staticmethod
|
|
116
119
|
@overload
|
|
@@ -187,6 +190,7 @@ class AbsEncoder(ABC):
|
|
|
187
190
|
except KeyError:
|
|
188
191
|
msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
|
|
189
192
|
logger.warning(msg)
|
|
193
|
+
warnings.warn(msg)
|
|
190
194
|
invalid_task_messages.add(msg)
|
|
191
195
|
invalid_keys.add(task_key)
|
|
192
196
|
|
|
@@ -232,9 +236,9 @@ class AbsEncoder(ABC):
|
|
|
232
236
|
if isinstance(prompt, dict) and prompt_type:
|
|
233
237
|
if prompt.get(prompt_type.value):
|
|
234
238
|
return prompt[prompt_type.value]
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
239
|
+
msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
|
|
240
|
+
logger.warning(msg)
|
|
241
|
+
warnings.warn(msg)
|
|
238
242
|
return ""
|
|
239
243
|
|
|
240
244
|
if prompt:
|
|
@@ -368,7 +372,7 @@ class AbsEncoder(ABC):
|
|
|
368
372
|
hf_split: str,
|
|
369
373
|
hf_subset: str,
|
|
370
374
|
prompt_type: PromptType | None = None,
|
|
371
|
-
**kwargs:
|
|
375
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
372
376
|
) -> Array:
|
|
373
377
|
"""Encodes the given sentences using the encoder.
|
|
374
378
|
|
|
@@ -5,8 +5,6 @@ from typing import Any, Protocol, runtime_checkable
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
|
|
8
|
-
from mteb.types import BatchedInput
|
|
9
|
-
|
|
10
8
|
|
|
11
9
|
@runtime_checkable
|
|
12
10
|
class CacheBackendProtocol(Protocol):
|
|
@@ -26,7 +24,7 @@ class CacheBackendProtocol(Protocol):
|
|
|
26
24
|
**kwargs: Additional backend-specific arguments.
|
|
27
25
|
"""
|
|
28
26
|
|
|
29
|
-
def add(self, item: list[
|
|
27
|
+
def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
|
|
30
28
|
"""Add a vector to the cache.
|
|
31
29
|
|
|
32
30
|
Args:
|
|
@@ -34,7 +32,7 @@ class CacheBackendProtocol(Protocol):
|
|
|
34
32
|
vectors: Embedding vector of shape (dim,) or (1, dim).
|
|
35
33
|
"""
|
|
36
34
|
|
|
37
|
-
def get_vector(self, item:
|
|
35
|
+
def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
|
|
38
36
|
"""Retrieve the cached vector for the given item.
|
|
39
37
|
|
|
40
38
|
Args:
|
|
@@ -53,5 +51,5 @@ class CacheBackendProtocol(Protocol):
|
|
|
53
51
|
def close(self) -> None:
|
|
54
52
|
"""Release resources or flush data."""
|
|
55
53
|
|
|
56
|
-
def __contains__(self, item:
|
|
54
|
+
def __contains__(self, item: dict[str, Any]) -> bool:
|
|
57
55
|
"""Check whether the cache contains an item."""
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
+
from collections.abc import Mapping
|
|
3
|
+
from typing import Any
|
|
2
4
|
|
|
3
|
-
from mteb.types import BatchedInput
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
def _hash_item(item: BatchedInput) -> str:
|
|
6
|
+
def _hash_item(item: Mapping[str, Any]) -> str:
|
|
7
7
|
item_hash = ""
|
|
8
8
|
if "text" in item:
|
|
9
|
-
|
|
9
|
+
item_text: str = item["text"]
|
|
10
|
+
item_hash = hashlib.sha256(item_text.encode()).hexdigest()
|
|
10
11
|
|
|
11
12
|
if "image" in item:
|
|
12
13
|
from PIL import Image
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
|
|
@@ -36,7 +38,7 @@ class FaissCache:
|
|
|
36
38
|
logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
|
|
37
39
|
self.load()
|
|
38
40
|
|
|
39
|
-
def add(self, items: list[
|
|
41
|
+
def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
|
|
40
42
|
"""Add vector to FAISS index."""
|
|
41
43
|
import faiss
|
|
42
44
|
|
|
@@ -71,7 +73,9 @@ class FaissCache:
|
|
|
71
73
|
try:
|
|
72
74
|
return self.index.reconstruct(idx)
|
|
73
75
|
except Exception:
|
|
74
|
-
|
|
76
|
+
msg = f"Vector id {idx} missing for hash {item_hash}"
|
|
77
|
+
logger.warning(msg)
|
|
78
|
+
warnings.warn(msg)
|
|
75
79
|
return None
|
|
76
80
|
|
|
77
81
|
def save(self) -> None:
|