PyPI - mteb - Versions diffs - 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl - Mend

mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (241) hide show

mteb/__init__.py +2 -0
mteb/_create_dataloaders.py +17 -18
mteb/_evaluators/any_sts_evaluator.py +3 -3
mteb/_evaluators/clustering_evaluator.py +2 -2
mteb/_evaluators/evaluator.py +4 -2
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
mteb/_evaluators/pair_classification_evaluator.py +5 -3
mteb/_evaluators/retrieval_evaluator.py +2 -2
mteb/_evaluators/retrieval_metrics.py +18 -17
mteb/_evaluators/sklearn_evaluator.py +11 -10
mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
mteb/_evaluators/text/summarization_evaluator.py +23 -18
mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
mteb/abstasks/_data_filter/filters.py +1 -1
mteb/abstasks/_data_filter/task_pipelines.py +3 -0
mteb/abstasks/_statistics_calculation.py +18 -10
mteb/abstasks/_stratification.py +18 -18
mteb/abstasks/abstask.py +35 -28
mteb/abstasks/aggregate_task_metadata.py +1 -9
mteb/abstasks/aggregated_task.py +10 -29
mteb/abstasks/classification.py +15 -10
mteb/abstasks/clustering.py +19 -15
mteb/abstasks/clustering_legacy.py +10 -10
mteb/abstasks/image/image_text_pair_classification.py +7 -4
mteb/abstasks/multilabel_classification.py +23 -19
mteb/abstasks/pair_classification.py +20 -11
mteb/abstasks/regression.py +4 -4
mteb/abstasks/retrieval.py +28 -24
mteb/abstasks/retrieval_dataset_loaders.py +2 -2
mteb/abstasks/sts.py +8 -5
mteb/abstasks/task_metadata.py +31 -33
mteb/abstasks/text/bitext_mining.py +39 -28
mteb/abstasks/text/reranking.py +8 -6
mteb/abstasks/text/summarization.py +10 -5
mteb/abstasks/zeroshot_classification.py +8 -4
mteb/benchmarks/benchmark.py +4 -2
mteb/benchmarks/benchmarks/__init__.py +4 -0
mteb/benchmarks/benchmarks/benchmarks.py +112 -11
mteb/benchmarks/get_benchmark.py +14 -55
mteb/cache.py +182 -29
mteb/cli/_display_tasks.py +2 -2
mteb/cli/build_cli.py +110 -14
mteb/cli/generate_model_card.py +43 -23
mteb/deprecated_evaluator.py +63 -49
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
mteb/evaluate.py +44 -33
mteb/filter_tasks.py +25 -26
mteb/get_tasks.py +29 -30
mteb/languages/language_scripts.py +5 -3
mteb/leaderboard/app.py +162 -34
mteb/load_results.py +12 -12
mteb/models/abs_encoder.py +10 -6
mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
mteb/models/cache_wrappers/cache_wrapper.py +2 -2
mteb/models/get_model_meta.py +21 -3
mteb/models/instruct_wrapper.py +28 -8
mteb/models/model_implementations/align_models.py +1 -1
mteb/models/model_implementations/andersborges.py +4 -4
mteb/models/model_implementations/ara_models.py +1 -1
mteb/models/model_implementations/arctic_models.py +8 -8
mteb/models/model_implementations/b1ade_models.py +1 -1
mteb/models/model_implementations/bge_models.py +45 -21
mteb/models/model_implementations/bica_model.py +3 -3
mteb/models/model_implementations/blip2_models.py +2 -2
mteb/models/model_implementations/blip_models.py +16 -16
mteb/models/model_implementations/bm25.py +4 -4
mteb/models/model_implementations/bmretriever_models.py +6 -4
mteb/models/model_implementations/cadet_models.py +1 -1
mteb/models/model_implementations/cde_models.py +11 -4
mteb/models/model_implementations/clip_models.py +6 -6
mteb/models/model_implementations/clips_models.py +3 -3
mteb/models/model_implementations/codefuse_models.py +5 -5
mteb/models/model_implementations/codesage_models.py +3 -3
mteb/models/model_implementations/cohere_models.py +5 -5
mteb/models/model_implementations/cohere_v.py +2 -2
mteb/models/model_implementations/colpali_models.py +3 -3
mteb/models/model_implementations/colqwen_models.py +8 -8
mteb/models/model_implementations/colsmol_models.py +2 -2
mteb/models/model_implementations/conan_models.py +1 -1
mteb/models/model_implementations/dino_models.py +42 -42
mteb/models/model_implementations/e5_instruct.py +23 -4
mteb/models/model_implementations/e5_models.py +9 -9
mteb/models/model_implementations/e5_v.py +6 -6
mteb/models/model_implementations/eagerworks_models.py +1 -1
mteb/models/model_implementations/emillykkejensen_models.py +6 -6
mteb/models/model_implementations/en_code_retriever.py +1 -1
mteb/models/model_implementations/euler_models.py +2 -2
mteb/models/model_implementations/fa_models.py +9 -9
mteb/models/model_implementations/facebookai.py +14 -2
mteb/models/model_implementations/geogpt_models.py +1 -1
mteb/models/model_implementations/gme_v_models.py +6 -5
mteb/models/model_implementations/google_models.py +1 -1
mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
mteb/models/model_implementations/gritlm_models.py +2 -2
mteb/models/model_implementations/gte_models.py +25 -13
mteb/models/model_implementations/hinvec_models.py +1 -1
mteb/models/model_implementations/ibm_granite_models.py +30 -6
mteb/models/model_implementations/inf_models.py +2 -2
mteb/models/model_implementations/jasper_models.py +2 -2
mteb/models/model_implementations/jina_clip.py +48 -10
mteb/models/model_implementations/jina_models.py +18 -11
mteb/models/model_implementations/kblab.py +12 -6
mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
mteb/models/model_implementations/kfst.py +1 -1
mteb/models/model_implementations/kowshik24_models.py +1 -1
mteb/models/model_implementations/lgai_embedding_models.py +1 -1
mteb/models/model_implementations/linq_models.py +1 -1
mteb/models/model_implementations/listconranker.py +1 -1
mteb/models/model_implementations/llm2clip_models.py +6 -6
mteb/models/model_implementations/llm2vec_models.py +8 -8
mteb/models/model_implementations/mcinext_models.py +4 -1
mteb/models/model_implementations/mdbr_models.py +17 -3
mteb/models/model_implementations/misc_models.py +68 -68
mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
mteb/models/model_implementations/mme5_models.py +1 -1
mteb/models/model_implementations/moco_models.py +4 -4
mteb/models/model_implementations/mod_models.py +1 -1
mteb/models/model_implementations/model2vec_models.py +14 -14
mteb/models/model_implementations/moka_models.py +1 -1
mteb/models/model_implementations/nbailab.py +3 -3
mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
mteb/models/model_implementations/nomic_models.py +30 -15
mteb/models/model_implementations/nomic_models_vision.py +1 -1
mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
mteb/models/model_implementations/nvidia_models.py +151 -19
mteb/models/model_implementations/octen_models.py +61 -2
mteb/models/model_implementations/openclip_models.py +13 -13
mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
mteb/models/model_implementations/ops_moa_models.py +1 -1
mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
mteb/models/model_implementations/pawan_models.py +1 -1
mteb/models/model_implementations/piccolo_models.py +1 -1
mteb/models/model_implementations/pixie_models.py +56 -0
mteb/models/model_implementations/promptriever_models.py +4 -4
mteb/models/model_implementations/pylate_models.py +10 -9
mteb/models/model_implementations/qodo_models.py +2 -2
mteb/models/model_implementations/qtack_models.py +1 -1
mteb/models/model_implementations/qwen3_models.py +3 -3
mteb/models/model_implementations/qzhou_models.py +2 -2
mteb/models/model_implementations/random_baseline.py +3 -3
mteb/models/model_implementations/rasgaard_models.py +2 -2
mteb/models/model_implementations/reasonir_model.py +1 -1
mteb/models/model_implementations/repllama_models.py +3 -3
mteb/models/model_implementations/rerankers_custom.py +12 -6
mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
mteb/models/model_implementations/richinfoai_models.py +1 -1
mteb/models/model_implementations/ru_sentence_models.py +20 -20
mteb/models/model_implementations/ruri_models.py +10 -10
mteb/models/model_implementations/salesforce_models.py +3 -3
mteb/models/model_implementations/samilpwc_models.py +1 -1
mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
mteb/models/model_implementations/searchmap_models.py +1 -1
mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
mteb/models/model_implementations/sentence_transformers_models.py +124 -22
mteb/models/model_implementations/shuu_model.py +1 -1
mteb/models/model_implementations/siglip_models.py +20 -20
mteb/models/model_implementations/slm_models.py +416 -0
mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
mteb/models/model_implementations/stella_models.py +17 -4
mteb/models/model_implementations/tarka_models.py +2 -2
mteb/models/model_implementations/text2vec_models.py +9 -3
mteb/models/model_implementations/ua_sentence_models.py +1 -1
mteb/models/model_implementations/uae_models.py +7 -1
mteb/models/model_implementations/vdr_models.py +1 -1
mteb/models/model_implementations/vi_vn_models.py +6 -6
mteb/models/model_implementations/vlm2vec_models.py +3 -3
mteb/models/model_implementations/voyage_models.py +84 -0
mteb/models/model_implementations/voyage_v.py +9 -7
mteb/models/model_implementations/youtu_models.py +1 -1
mteb/models/model_implementations/yuan_models.py +1 -1
mteb/models/model_implementations/yuan_models_en.py +1 -1
mteb/models/model_meta.py +80 -31
mteb/models/models_protocols.py +22 -6
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
mteb/models/search_wrappers.py +33 -18
mteb/models/sentence_transformer_wrapper.py +50 -25
mteb/models/vllm_wrapper.py +327 -0
mteb/py.typed +0 -0
mteb/results/benchmark_results.py +29 -21
mteb/results/model_result.py +52 -22
mteb/results/task_result.py +80 -58
mteb/similarity_functions.py +11 -7
mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
mteb/tasks/classification/est/estonian_valence.py +1 -1
mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
mteb/tasks/classification/multilingual/scala_classification.py +1 -1
mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
mteb/tasks/retrieval/code/code_rag.py +12 -12
mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
mteb/tasks/retrieval/eng/__init__.py +2 -0
mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
mteb/tasks/retrieval/kor/__init__.py +15 -1
mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
mteb/tasks/retrieval/multilingual/__init__.py +2 -0
mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
mteb/tasks/retrieval/nob/norquad.py +2 -2
mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
mteb/tasks/retrieval/vie/__init__.py +14 -6
mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
mteb/types/__init__.py +2 -0
mteb/types/_encoder_io.py +12 -0
mteb/types/_result.py +2 -1
mteb/types/statistics.py +9 -3
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
mteb/models/model_implementations/mxbai_models.py +0 -111
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
{mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0

mteb/leaderboard/app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import tempfile
 import time
 import warnings
 from pathlib import Path
-from typing import Literal
+from typing import Literal, get_args
 from urllib.parse import urlencode
 import cachetools
@@ -29,40 +29,115 @@ from mteb.leaderboard.table import (
     apply_summary_styling_from_benchmark,
 )
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
+from mteb.models.model_meta import MODEL_TYPES
 logger = logging.getLogger(__name__)
 LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
+MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
 def _load_results(cache: ResultCache) -> BenchmarkResults:
+    """Load benchmark results using an optimized caching strategy.
+    This function implements a two-tier caching strategy for faster leaderboard startup:
+    1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
+       'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
+       This avoids the need to clone the entire results repository and provides
+       near-instantaneous loading for most users.
+    2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
+       the original approach of downloading the full results repository and
+       building the cache from scratch.
+    The cached results file contains pre-aggregated benchmark data that eliminates
+    the need for expensive operations like task selection and revision joining
+    during app startup.
+    Args:
+        cache: ResultCache instance used for both optimized and fallback operations
+    Returns:
+        BenchmarkResults: Complete benchmark results ready for leaderboard display
+    Raises:
+        Various exceptions related to network issues, file I/O, or data validation
+        are logged and may cause fallback to the slower repository-based approach.
+    """
     start_time = time.time()
     results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
     if not results_cache_path.exists():
-        logger.info("Cached results not found, downloading from remote...")
-        cache.download_from_remote()
-        download_time = time.time() - start_time
-        logger.info(f"Downloaded remote results in {download_time:.2f}s")
-        load_start = time.time()
-        all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
-        all_results = cache.load_results(
-            models=all_model_names,
-            only_main_score=True,
-            require_model_meta=False,
-            include_remote=True,
+        # First try to download the cached results file from the cached-data branch
+        # This is faster than cloning the entire results repository
+        logger.info(
+            "Cached results not found, trying to download from cached-data branch..."
         )
-        load_time = time.time() - load_start
-        logger.info(f"Loaded results from cache in {load_time:.2f}s")
-        return all_results
-    else:
-        logger.info("Loading cached results from disk...")
+        try:
+            # Use ResultCache's optimized download method
+            # Default saves to mteb/leaderboard/__cached_results.json
+            results_cache_path = cache._download_cached_results_from_branch()
+            download_time = time.time() - start_time
+            logger.info(
+                f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
+            )
+        except Exception as e:
+            logger.error(
+                f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
+            )
+            logger.info("Falling back to downloading full remote repository...")
+            # Fall back to the original approach: clone the full repo
+            cache.download_from_remote()
+            download_time = time.time() - start_time
+            logger.info(f"Downloaded remote results in {download_time:.2f}s")
+            load_start = time.time()
+            all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
+            all_results = cache.load_results(
+                models=all_model_names,
+                only_main_score=True,
+                require_model_meta=False,
+                include_remote=True,
+            )
+            load_time = time.time() - load_start
+            logger.info(f"Loaded results from cache in {load_time:.2f}s")
+            return all_results
+    # Load the cached results file (either pre-existing or just downloaded)
+    logger.info("Loading cached results from disk...")
+    try:
+        logger.info(f"Opening file: {results_cache_path}")
+        file_size = results_cache_path.stat().st_size
+        logger.info(f"File exists, size: {file_size} bytes")
         with results_cache_path.open() as cache_file:
-            results = mteb.BenchmarkResults.from_validated(**json.load(cache_file))
-        total_time = time.time() - start_time
-        logger.info(f"Loaded cached results in {total_time:.2f}s")
-        return results
+            logger.info("File opened successfully, attempting JSON parse...")
+            json_data = json.load(cache_file)
+            logger.info(
+                f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
+            )
+        logger.info("Attempting BenchmarkResults.from_validated...")
+        results = mteb.BenchmarkResults.from_validated(**json_data)
+        logger.info("BenchmarkResults.from_validated successful")
+    except Exception as e:
+        # TODO: Handle the case when we fail to load cached results from disk.
+        logger.error(
+            f"Failed to load cached results from disk: {type(e).__name__}: {e}"
+        )
+        raise
+    total_time = time.time() - start_time
+    logger.info(f"Loaded cached results in {total_time:.2f}s")
+    return results
 def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
@@ -169,7 +244,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
     df = df.drop(columns="reference")
     return gr.DataFrame(
         df,
-        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),  # type: ignore
+        datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
         buttons=["copy", "fullscreen"],
         show_search="filter",
     )
@@ -187,6 +262,7 @@ def _filter_models(
     instructions: bool | None,
     max_model_size: int,
     zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
+    model_types: list[str] | None,
 ):
     lower, upper = 0, max_model_size
     # Setting to None, when the user doesn't specify anything
@@ -205,6 +281,7 @@ def _filter_models(
         use_instructions=instructions,
         frameworks=compatibility,
         n_parameters_range=(lower, upper),
+        model_types=model_types,
     )
     models_to_keep = set()
@@ -269,6 +346,7 @@ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
         instructions=None,
         max_model_size=MAX_MODEL_SIZE,
         zero_shot_setting="allow_all",
+        model_types=MODEL_TYPE_CHOICES,
     )
     # Sort to ensure consistency with update_models
     initial_models = sorted(initial_models)
@@ -387,6 +465,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
         instructions=None,
         max_model_size=MAX_MODEL_SIZE,
         zero_shot_setting="allow_all",
+        model_types=MODEL_TYPE_CHOICES,
     )
     default_filtered_scores = [
         entry for entry in default_scores if entry["model_name"] in filtered_models
@@ -583,6 +662,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                                     label="Model Parameters",
                                     interactive=True,
                                 )
+                            with gr.Column():
+                                model_type_select = gr.CheckboxGroup(
+                                    MODEL_TYPE_CHOICES,
+                                    value=MODEL_TYPE_CHOICES,
+                                    label="Model Type",
+                                )
         with gr.Tab("Summary"):
             summary_table.render()
@@ -755,7 +840,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             compatibility,
             instructions,
             max_model_size,
-            zero_shot: hash(
+            zero_shot,
+            model_type_select: hash(
                 (
                     id(scores),
                     hash(tuple(tasks)),
@@ -764,6 +850,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                     hash(instructions),
                     hash(max_model_size),
                     hash(zero_shot),
+                    hash(tuple(model_type_select)),
                 )
             ),
         )
@@ -775,6 +862,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
             instructions: bool | None,
             max_model_size: int,
             zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
+            model_type_select: list[str],
         ):
             start_time = time.time()
             model_names = list({entry["model_name"] for entry in scores})
@@ -786,6 +874,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot_setting=zero_shot,
+                model_types=model_type_select,
             )
             elapsed = time.time() - start_time
             logger.debug(f"update_models callback: {elapsed}s")
@@ -803,6 +892,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -817,6 +907,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -830,6 +921,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -843,6 +935,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -856,6 +949,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -869,6 +963,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -882,6 +977,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
                 instructions,
                 max_model_size,
                 zero_shot,
+                model_type_select,
+            ],
+            outputs=[models],
+        )
+        model_type_select.change(
+            update_models,
+            inputs=[
+                scores,
+                task_select,
+                availability,
+                compatibility,
+                instructions,
+                max_model_size,
+                zero_shot,
+                model_type_select,
             ],
             outputs=[models],
         )
@@ -1023,16 +1133,34 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
 if __name__ == "__main__":
-    logging.getLogger("mteb.load_results.task_results").setLevel(
-        logging.ERROR
-    )  # Warnings related to task split
-    logging.getLogger("mteb.model_meta").setLevel(
-        logging.ERROR
-    )  # Warning related to model metadata (fetch_from_hf=False)
-    logging.getLogger("mteb.load_results.benchmark_results").setLevel(
-        logging.ERROR
-    )  # Warning related to model metadata (fetch_from_hf=False)
+    import os
+    # Add process ID to logging for multiprocessing debugging
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
+        force=True,  # Override any existing handlers
+    )
+    # Flush log handlers immediately (helpful for multiprocessing)
+    for handler in logging.root.handlers:
+        handler.flush()
+    logger.info(f"Starting leaderboard app in process {os.getpid()}")
+    # Suppress specific WARNING messages while keeping INFO level for the app
+    logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
+    logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
+    logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
+    warnings.filterwarnings("ignore", message="Could not get source model: .*")
+    warnings.filterwarnings(
+        "ignore", message="No scores data available. Returning empty DataFrame."
+    )
+    warnings.filterwarnings("ignore", message="Main score .* not found in scores")
+    warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
+    warnings.filterwarnings("ignore", message=".*: Missing splits .*")
     app = get_leaderboard_app()

mteb/load_results.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import json
 import logging
 import sys
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from pathlib import Path
 from mteb.abstasks.abstask import AbsTask
@@ -45,8 +45,8 @@ def _model_name_and_revision(
 def load_results(
     results_repo: str = "https://github.com/embeddings-benchmark/results",
     download_latest: bool = True,
-    models: Sequence[ModelMeta] | Sequence[str] | None = None,
-    tasks: Sequence[AbsTask] | Sequence[str] | None = None,
+    models: Iterable[ModelMeta] | Sequence[str] | None = None,
+    tasks: Iterable[AbsTask] | Sequence[str] | None = None,
     validate_and_filter: bool = True,
     require_model_meta: bool = True,
     only_main_score: bool = False,
@@ -83,21 +83,21 @@ def load_results(
     if models is not None:
         models_to_keep = {}
-        for model_path in models:
-            if isinstance(model_path, ModelMeta):
-                models_to_keep[model_path.name] = model_path.revision
+        for model in models:
+            if isinstance(model, ModelMeta):
+                models_to_keep[model.name] = model.revision
             else:
-                models_to_keep[model_path] = None
+                models_to_keep[model] = None
     else:
         models_to_keep = None
-    task_names = {}
+    task_names: dict[str, AbsTask | None] = {}
     if tasks is not None:
-        for task in tasks:
-            if isinstance(task, AbsTask):
-                task_names[task.metadata.name] = task
+        for task_ in tasks:
+            if isinstance(task_, AbsTask):
+                task_names[task_.metadata.name] = task_
             else:
-                task_names[task] = None
+                task_names[task_] = None
     model_results = []
     for model_path in model_paths:

mteb/models/abs_encoder.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import logging
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Sequence
 from typing import Any, Literal, cast, get_args, overload
 from torch.utils.data import DataLoader
+from typing_extensions import Unpack
 import mteb
 from mteb.abstasks.task_metadata import TaskMetadata, TaskType
@@ -18,6 +20,7 @@ from mteb.similarity_functions import (
 from mteb.types import (
     Array,
     BatchedInput,
+    EncodeKwargs,
     PromptType,
 )
@@ -43,7 +46,7 @@ class AbsEncoder(ABC):
     model: Any
     mteb_model_meta: ModelMeta | None = None
     model_prompts: dict[str, str] | None = None
-    instruction_template: str | Callable[[str, PromptType], str] | None = None
+    instruction_template: str | Callable[[str, PromptType | None], str] | None = None
     prompts_dict: dict[str, str] | None = None
     def get_prompt_name(
@@ -110,7 +113,7 @@ class AbsEncoder(ABC):
         if not self.model_prompts:
             return None
         prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-        return self.model_prompts.get(prompt_name)
+        return self.model_prompts.get(prompt_name) if prompt_name else None
     @staticmethod
     @overload
@@ -187,6 +190,7 @@ class AbsEncoder(ABC):
                 except KeyError:
                     msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
                     logger.warning(msg)
+                    warnings.warn(msg)
                     invalid_task_messages.add(msg)
                     invalid_keys.add(task_key)
@@ -232,9 +236,9 @@ class AbsEncoder(ABC):
         if isinstance(prompt, dict) and prompt_type:
             if prompt.get(prompt_type.value):
                 return prompt[prompt_type.value]
-            logger.warning(
-                f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
-            )
+            msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
+            logger.warning(msg)
+            warnings.warn(msg)
             return ""
         if prompt:
@@ -368,7 +372,7 @@ class AbsEncoder(ABC):
         hf_split: str,
         hf_subset: str,
         prompt_type: PromptType | None = None,
-        **kwargs: Any,
+        **kwargs: Unpack[EncodeKwargs],
     ) -> Array:
         """Encodes the given sentences using the encoder.

mteb/models/cache_wrappers/cache_backend_protocol.py CHANGED Viewed

@@ -5,8 +5,6 @@ from typing import Any, Protocol, runtime_checkable
 import numpy as np
-from mteb.types import BatchedInput
 @runtime_checkable
 class CacheBackendProtocol(Protocol):
@@ -26,7 +24,7 @@ class CacheBackendProtocol(Protocol):
             **kwargs: Additional backend-specific arguments.
         """
-    def add(self, item: list[BatchedInput], vectors: np.ndarray) -> None:
+    def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add a vector to the cache.
         Args:
@@ -34,7 +32,7 @@ class CacheBackendProtocol(Protocol):
             vectors: Embedding vector of shape (dim,) or (1, dim).
         """
-    def get_vector(self, item: BatchedInput) -> np.ndarray | None:
+    def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
         """Retrieve the cached vector for the given item.
         Args:
@@ -53,5 +51,5 @@ class CacheBackendProtocol(Protocol):
     def close(self) -> None:
         """Release resources or flush data."""
-    def __contains__(self, item: BatchedInput) -> bool:
+    def __contains__(self, item: dict[str, Any]) -> bool:
         """Check whether the cache contains an item."""

mteb/models/cache_wrappers/cache_backends/_hash_utils.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import hashlib
+from collections.abc import Mapping
+from typing import Any
-from mteb.types import BatchedInput
-def _hash_item(item: BatchedInput) -> str:
+def _hash_item(item: Mapping[str, Any]) -> str:
     item_hash = ""
     if "text" in item:
-        item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
+        item_text: str = item["text"]
+        item_hash = hashlib.sha256(item_text.encode()).hexdigest()
     if "image" in item:
         from PIL import Image

mteb/models/cache_wrappers/cache_backends/faiss_cache.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import logging
+import warnings
 from pathlib import Path
+from typing import Any
 import numpy as np
@@ -36,7 +38,7 @@ class FaissCache:
         logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
         self.load()
-    def add(self, items: list[BatchedInput], vectors: np.ndarray) -> None:
+    def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add vector to FAISS index."""
         import faiss
@@ -71,7 +73,9 @@ class FaissCache:
         try:
             return self.index.reconstruct(idx)
         except Exception:
-            logger.warning(f"Vector id {idx} missing for hash {item_hash}")
+            msg = f"Vector id {idx} missing for hash {item_hash}"
+            logger.warning(msg)
+            warnings.warn(msg)
             return None
     def save(self) -> None:

mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl