mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +17 -18
  3. mteb/_evaluators/any_sts_evaluator.py +3 -3
  4. mteb/_evaluators/clustering_evaluator.py +2 -2
  5. mteb/_evaluators/evaluator.py +4 -2
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
  7. mteb/_evaluators/pair_classification_evaluator.py +5 -3
  8. mteb/_evaluators/retrieval_evaluator.py +2 -2
  9. mteb/_evaluators/retrieval_metrics.py +18 -17
  10. mteb/_evaluators/sklearn_evaluator.py +11 -10
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
  12. mteb/_evaluators/text/summarization_evaluator.py +23 -18
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  14. mteb/abstasks/_data_filter/filters.py +1 -1
  15. mteb/abstasks/_data_filter/task_pipelines.py +3 -0
  16. mteb/abstasks/_statistics_calculation.py +18 -10
  17. mteb/abstasks/_stratification.py +18 -18
  18. mteb/abstasks/abstask.py +35 -28
  19. mteb/abstasks/aggregate_task_metadata.py +1 -9
  20. mteb/abstasks/aggregated_task.py +10 -29
  21. mteb/abstasks/classification.py +15 -10
  22. mteb/abstasks/clustering.py +19 -15
  23. mteb/abstasks/clustering_legacy.py +10 -10
  24. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  25. mteb/abstasks/multilabel_classification.py +23 -19
  26. mteb/abstasks/pair_classification.py +20 -11
  27. mteb/abstasks/regression.py +4 -4
  28. mteb/abstasks/retrieval.py +28 -24
  29. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  30. mteb/abstasks/sts.py +8 -5
  31. mteb/abstasks/task_metadata.py +31 -33
  32. mteb/abstasks/text/bitext_mining.py +39 -28
  33. mteb/abstasks/text/reranking.py +8 -6
  34. mteb/abstasks/text/summarization.py +10 -5
  35. mteb/abstasks/zeroshot_classification.py +8 -4
  36. mteb/benchmarks/benchmark.py +4 -2
  37. mteb/benchmarks/benchmarks/__init__.py +4 -0
  38. mteb/benchmarks/benchmarks/benchmarks.py +112 -11
  39. mteb/benchmarks/get_benchmark.py +14 -55
  40. mteb/cache.py +182 -29
  41. mteb/cli/_display_tasks.py +2 -2
  42. mteb/cli/build_cli.py +110 -14
  43. mteb/cli/generate_model_card.py +43 -23
  44. mteb/deprecated_evaluator.py +63 -49
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  49. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  51. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  53. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  54. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  55. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  56. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  57. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  58. mteb/evaluate.py +44 -33
  59. mteb/filter_tasks.py +25 -26
  60. mteb/get_tasks.py +29 -30
  61. mteb/languages/language_scripts.py +5 -3
  62. mteb/leaderboard/app.py +162 -34
  63. mteb/load_results.py +12 -12
  64. mteb/models/abs_encoder.py +10 -6
  65. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  66. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
  67. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  68. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  69. mteb/models/cache_wrappers/cache_wrapper.py +2 -2
  70. mteb/models/get_model_meta.py +21 -3
  71. mteb/models/instruct_wrapper.py +28 -8
  72. mteb/models/model_implementations/align_models.py +1 -1
  73. mteb/models/model_implementations/andersborges.py +4 -4
  74. mteb/models/model_implementations/ara_models.py +1 -1
  75. mteb/models/model_implementations/arctic_models.py +8 -8
  76. mteb/models/model_implementations/b1ade_models.py +1 -1
  77. mteb/models/model_implementations/bge_models.py +45 -21
  78. mteb/models/model_implementations/bica_model.py +3 -3
  79. mteb/models/model_implementations/blip2_models.py +2 -2
  80. mteb/models/model_implementations/blip_models.py +16 -16
  81. mteb/models/model_implementations/bm25.py +4 -4
  82. mteb/models/model_implementations/bmretriever_models.py +6 -4
  83. mteb/models/model_implementations/cadet_models.py +1 -1
  84. mteb/models/model_implementations/cde_models.py +11 -4
  85. mteb/models/model_implementations/clip_models.py +6 -6
  86. mteb/models/model_implementations/clips_models.py +3 -3
  87. mteb/models/model_implementations/codefuse_models.py +5 -5
  88. mteb/models/model_implementations/codesage_models.py +3 -3
  89. mteb/models/model_implementations/cohere_models.py +5 -5
  90. mteb/models/model_implementations/cohere_v.py +2 -2
  91. mteb/models/model_implementations/colpali_models.py +3 -3
  92. mteb/models/model_implementations/colqwen_models.py +8 -8
  93. mteb/models/model_implementations/colsmol_models.py +2 -2
  94. mteb/models/model_implementations/conan_models.py +1 -1
  95. mteb/models/model_implementations/dino_models.py +42 -42
  96. mteb/models/model_implementations/e5_instruct.py +23 -4
  97. mteb/models/model_implementations/e5_models.py +9 -9
  98. mteb/models/model_implementations/e5_v.py +6 -6
  99. mteb/models/model_implementations/eagerworks_models.py +1 -1
  100. mteb/models/model_implementations/emillykkejensen_models.py +6 -6
  101. mteb/models/model_implementations/en_code_retriever.py +1 -1
  102. mteb/models/model_implementations/euler_models.py +2 -2
  103. mteb/models/model_implementations/fa_models.py +9 -9
  104. mteb/models/model_implementations/facebookai.py +14 -2
  105. mteb/models/model_implementations/geogpt_models.py +1 -1
  106. mteb/models/model_implementations/gme_v_models.py +6 -5
  107. mteb/models/model_implementations/google_models.py +1 -1
  108. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  109. mteb/models/model_implementations/gritlm_models.py +2 -2
  110. mteb/models/model_implementations/gte_models.py +25 -13
  111. mteb/models/model_implementations/hinvec_models.py +1 -1
  112. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  113. mteb/models/model_implementations/inf_models.py +2 -2
  114. mteb/models/model_implementations/jasper_models.py +2 -2
  115. mteb/models/model_implementations/jina_clip.py +48 -10
  116. mteb/models/model_implementations/jina_models.py +18 -11
  117. mteb/models/model_implementations/kblab.py +12 -6
  118. mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
  119. mteb/models/model_implementations/kfst.py +1 -1
  120. mteb/models/model_implementations/kowshik24_models.py +1 -1
  121. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  122. mteb/models/model_implementations/linq_models.py +1 -1
  123. mteb/models/model_implementations/listconranker.py +1 -1
  124. mteb/models/model_implementations/llm2clip_models.py +6 -6
  125. mteb/models/model_implementations/llm2vec_models.py +8 -8
  126. mteb/models/model_implementations/mcinext_models.py +4 -1
  127. mteb/models/model_implementations/mdbr_models.py +17 -3
  128. mteb/models/model_implementations/misc_models.py +68 -68
  129. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  130. mteb/models/model_implementations/mme5_models.py +1 -1
  131. mteb/models/model_implementations/moco_models.py +4 -4
  132. mteb/models/model_implementations/mod_models.py +1 -1
  133. mteb/models/model_implementations/model2vec_models.py +14 -14
  134. mteb/models/model_implementations/moka_models.py +1 -1
  135. mteb/models/model_implementations/nbailab.py +3 -3
  136. mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
  137. mteb/models/model_implementations/nomic_models.py +30 -15
  138. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  139. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
  140. mteb/models/model_implementations/nvidia_models.py +151 -19
  141. mteb/models/model_implementations/octen_models.py +61 -2
  142. mteb/models/model_implementations/openclip_models.py +13 -13
  143. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  144. mteb/models/model_implementations/ops_moa_models.py +1 -1
  145. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  146. mteb/models/model_implementations/pawan_models.py +1 -1
  147. mteb/models/model_implementations/piccolo_models.py +1 -1
  148. mteb/models/model_implementations/pixie_models.py +56 -0
  149. mteb/models/model_implementations/promptriever_models.py +4 -4
  150. mteb/models/model_implementations/pylate_models.py +10 -9
  151. mteb/models/model_implementations/qodo_models.py +2 -2
  152. mteb/models/model_implementations/qtack_models.py +1 -1
  153. mteb/models/model_implementations/qwen3_models.py +3 -3
  154. mteb/models/model_implementations/qzhou_models.py +2 -2
  155. mteb/models/model_implementations/random_baseline.py +3 -3
  156. mteb/models/model_implementations/rasgaard_models.py +2 -2
  157. mteb/models/model_implementations/reasonir_model.py +1 -1
  158. mteb/models/model_implementations/repllama_models.py +3 -3
  159. mteb/models/model_implementations/rerankers_custom.py +12 -6
  160. mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
  161. mteb/models/model_implementations/richinfoai_models.py +1 -1
  162. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  163. mteb/models/model_implementations/ruri_models.py +10 -10
  164. mteb/models/model_implementations/salesforce_models.py +3 -3
  165. mteb/models/model_implementations/samilpwc_models.py +1 -1
  166. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  167. mteb/models/model_implementations/searchmap_models.py +1 -1
  168. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
  169. mteb/models/model_implementations/sentence_transformers_models.py +124 -22
  170. mteb/models/model_implementations/shuu_model.py +1 -1
  171. mteb/models/model_implementations/siglip_models.py +20 -20
  172. mteb/models/model_implementations/slm_models.py +416 -0
  173. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  174. mteb/models/model_implementations/stella_models.py +17 -4
  175. mteb/models/model_implementations/tarka_models.py +2 -2
  176. mteb/models/model_implementations/text2vec_models.py +9 -3
  177. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  178. mteb/models/model_implementations/uae_models.py +7 -1
  179. mteb/models/model_implementations/vdr_models.py +1 -1
  180. mteb/models/model_implementations/vi_vn_models.py +6 -6
  181. mteb/models/model_implementations/vlm2vec_models.py +3 -3
  182. mteb/models/model_implementations/voyage_models.py +84 -0
  183. mteb/models/model_implementations/voyage_v.py +9 -7
  184. mteb/models/model_implementations/youtu_models.py +1 -1
  185. mteb/models/model_implementations/yuan_models.py +1 -1
  186. mteb/models/model_implementations/yuan_models_en.py +1 -1
  187. mteb/models/model_meta.py +80 -31
  188. mteb/models/models_protocols.py +22 -6
  189. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
  190. mteb/models/search_wrappers.py +33 -18
  191. mteb/models/sentence_transformer_wrapper.py +50 -25
  192. mteb/models/vllm_wrapper.py +327 -0
  193. mteb/py.typed +0 -0
  194. mteb/results/benchmark_results.py +29 -21
  195. mteb/results/model_result.py +52 -22
  196. mteb/results/task_result.py +80 -58
  197. mteb/similarity_functions.py +11 -7
  198. mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
  199. mteb/tasks/classification/est/estonian_valence.py +1 -1
  200. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  201. mteb/tasks/classification/multilingual/scala_classification.py +1 -1
  202. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  203. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  204. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  205. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  206. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  207. mteb/tasks/retrieval/code/code_rag.py +12 -12
  208. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  209. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  210. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  211. mteb/tasks/retrieval/eng/__init__.py +2 -0
  212. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  213. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  214. mteb/tasks/retrieval/kor/__init__.py +15 -1
  215. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  216. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  217. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  218. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  219. mteb/tasks/retrieval/nob/norquad.py +2 -2
  220. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  221. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  222. mteb/tasks/retrieval/vie/__init__.py +14 -6
  223. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  224. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  225. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  226. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  227. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  228. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  229. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  230. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  231. mteb/types/__init__.py +2 -0
  232. mteb/types/_encoder_io.py +12 -0
  233. mteb/types/_result.py +2 -1
  234. mteb/types/statistics.py +9 -3
  235. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
  236. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
  237. mteb/models/model_implementations/mxbai_models.py +0 -111
  238. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  239. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  240. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  241. {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/cache.py CHANGED
@@ -1,13 +1,19 @@
+ import gzip
+ import io
  import json
  import logging
  import os
  import shutil
  import subprocess
+ import warnings
  from collections import defaultdict
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence
  from pathlib import Path
  from typing import cast

+ import requests
+ from pydantic import ValidationError
+
  import mteb
  from mteb.abstasks import AbsTask
  from mteb.benchmarks.benchmark import Benchmark
@@ -22,8 +28,8 @@ class ResultCache:
      """Class to handle the local cache of MTEB results.

      Examples:
-         >>> from mteb.cache import ResultCache
-         >>> cache = ResultCache(cache_path="~/.cache/mteb") # default
+         >>> import mteb
+         >>> cache = mteb.ResultCache(cache_path="~/.cache/mteb") # default
          >>> cache.download_from_remote() # download the latest results from the remote repository
          >>> result = cache.load_results("task_name", "model_name")
      """
@@ -83,9 +89,9 @@ class ResultCache:
          model_path = results_folder / model_name

          if model_revision is None:
-             logger.warning(
-                 "model_revision is not specified, attempting to load the latest revision. To disable this behavior, specify model_revision explicitly."
-             )
+             msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
+             logger.warning(msg)
+             warnings.warn(msg)
              # get revs from paths
              revisions = [p for p in model_path.glob("*") if p.is_dir()]
              if not revisions:
@@ -275,21 +281,165 @@ class ResultCache:

          return results_directory

+     def _download_cached_results_from_branch(
+         self,
+         branch: str = "cached-data",
+         filename: str = "__cached_results.json.gz",
+         output_path: Path | None = None,
+         remote: str = "https://github.com/embeddings-benchmark/results",
+         timeout: int = 60,
+         max_size_mb: int = 500,
+     ) -> Path:
+         """Download pre-computed cached results from a specific branch.
+
+         This is significantly faster than download_from_remote() since it downloads
+         only a compressed cache file instead of cloning the entire repository.
+
+         The method performs the following steps:
+         1. Downloads a gzipped JSON file from the specified branch
+         2. Validates file size and content type
+         3. Decompresses the gzip content
+         4. Writes the decompressed JSON to disk
+
+         Args:
+             branch: Branch name to download from (default: "cached-data")
+             filename: Name of the cached results file (default: "__cached_results.json.gz")
+             output_path: Where to save the file. If None, uses mteb/leaderboard/__cached_results.json
+             remote: Base URL of the results repository
+             timeout: Request timeout in seconds (default: 60)
+             max_size_mb: Maximum allowed file size in megabytes (default: 500)
+
+         Returns:
+             Path to the downloaded and decompressed cache file
+
+         Raises:
+             requests.exceptions.RequestException: On HTTP errors
+             ValueError: On validation failures (size, content-type)
+             gzip.BadGzipFile: If content is not valid gzip
+             UnicodeDecodeError: If content cannot be decoded as UTF-8
+             PermissionError: If file cannot be written due to permissions
+             OSError: On other file system errors
+
+         Examples:
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
+             >>> # Download optimized cached results
+             >>> cache_file = cache._download_cached_results_from_branch()
+             >>> # Use custom output path
+             >>> cache_file = cache._download_cached_results_from_branch(
+             ...     output_path=Path("/tmp/my_cache.json")
+             ... )
+         """
+         if output_path is None:
+             # Default to saving in mteb/leaderboard/__cached_results.json
+             # Get the mteb package directory (parent of this file)
+             mteb_package_dir = Path(__file__).parent
+             output_path = mteb_package_dir / "leaderboard" / "__cached_results.json"
+
+         # Extract repository owner and name from the remote URL
+         # e.g., "https://github.com/embeddings-benchmark/results" -> "embeddings-benchmark/results"
+         repo_path = remote.replace("https://github.com/", "").replace(
+             "http://github.com/", ""
+         )
+
+         url = f"https://raw.githubusercontent.com/{repo_path}/{branch}/{filename}"
+         logger.info(f"Downloading cached results from {url}")
+
+         # Step 1: Download with validation
+         max_size_bytes = max_size_mb * 1024 * 1024
+
+         try:
+             response = requests.get(url, timeout=timeout)
+             response.raise_for_status()
+
+             # Check if this is a Git LFS pointer file
+             content_type = response.headers.get("content-type", "").lower()
+             if (
+                 content_type == "text/plain; charset=utf-8"
+                 and b"git-lfs" in response.content
+             ):
+                 # Try Git LFS media URL instead
+                 media_url = f"https://media.githubusercontent.com/media/{repo_path}/{branch}/{filename}"
+                 logger.info(f"Detected Git LFS file, trying media URL: {media_url}")
+                 response = requests.get(media_url, timeout=timeout)
+                 response.raise_for_status()
+                 content_type = response.headers.get("content-type", "").lower()
+
+             # Validate content-type header
+             expected_content_types = [
+                 "application/gzip",
+                 "application/octet-stream",
+                 "application/x-gzip",
+             ]
+             if content_type and not any(
+                 ct in content_type for ct in expected_content_types
+             ):
+                 raise Exception(
+                     f"Unexpected content-type: {content_type}. Expected one of: {expected_content_types}"
+                 )
+
+             # Validate file size
+             content_length = len(response.content)
+             if content_length > max_size_bytes:
+                 raise ValueError(
+                     f"Downloaded file too large: {content_length} bytes (max: {max_size_bytes})"
+                 )
+
+             logger.info(
+                 f"HTTP request successful, content length: {content_length} bytes"
+             )
+             content = response.content
+
+         except Exception as e:
+             logger.error(f"Unexpected HTTP error: {type(e).__name__}: {e}")
+             raise e
+
+         # Step 2: Decompress gzip data
+         logger.info("Attempting gzip decompression...")
+
+         try:
+             with gzip.open(io.BytesIO(content), "rt", encoding="utf-8") as gz_file:
+                 data = gz_file.read()
+             logger.info(f"Decompression successful, data length: {len(data)} chars")
+
+         except Exception as e:
+             logger.error(f"Unexpected decompression error: {type(e).__name__}: {e}")
+             raise e
+
+         # Step 3: Write to disk
+         logger.info(f"Attempting to write to: {output_path}")
+
+         # Check parent directory exists and is writable
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         try:
+             output_path.write_text(data, encoding="utf-8")
+             logger.info(
+                 f"File write successful, size: {output_path.stat().st_size} bytes"
+             )
+         except Exception as e:
+             logger.error(f"Unexpected file write error: {type(e).__name__}: {e}")
+             raise e
+
+         return output_path
+
      def clear_cache(self) -> None:
          """Clear the local cache directory."""
          if self.cache_path.exists() and self.cache_path.is_dir():
              shutil.rmtree(self.cache_path)
              logger.info(f"Cache directory {self.cache_path} cleared.")
          else:
-             logger.warning(f"Cache directory {self.cache_path} does not exist.")
+             msg = f"Cache directory `{self.cache_path}` does not exist."
+             logger.warning(msg)
+             warnings.warn(msg)

      def __repr__(self) -> str:
          return f"ResultCache(cache_path={self.cache_path})"

      def get_cache_paths(
          self,
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
-         tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | None = None,
          require_model_meta: bool = True,
          include_remote: bool = True,
      ) -> list[Path]:
@@ -311,8 +461,8 @@ class ResultCache:
              A list of paths in the cache directory.

          Examples:
-             >>> from mteb.cache import ResultCache
-             >>> cache = ResultCache()
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
              >>>
              >>> # Get all cache paths
              >>> paths = cache.get_cache_paths()
@@ -422,7 +572,7 @@ class ResultCache:
      @staticmethod
      def _filter_paths_by_model_and_revision(
          paths: list[Path],
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
      ) -> list[Path]:
          """Filter a list of paths by model name and optional revision.

@@ -432,8 +582,9 @@ class ResultCache:
          if not models:
              return paths

-         if isinstance(models[0], ModelMeta):
-             models = cast(list[ModelMeta], models)
+         first_model = next(iter(models))
+         if isinstance(first_model, ModelMeta):
+             models = cast(Iterable[ModelMeta], models)
              name_and_revision = {
                  (m.model_name_as_path(), m.revision or "no_revision_available")
                  for m in models
@@ -444,13 +595,14 @@ class ResultCache:
                  if (p.parent.parent.name, p.parent.name) in name_and_revision
              ]

-         model_names = {m.replace("/", "__").replace(" ", "_") for m in models}
+         str_models = cast(Sequence[str], models)
+         model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
          return [p for p in paths if p.parent.parent.name in model_names]

      @staticmethod
      def _filter_paths_by_task(
          paths: list[Path],
-         tasks: Sequence[str] | Sequence[AbsTask] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | None = None,
      ) -> list[Path]:
          if tasks is not None:
              task_names = set()
@@ -466,8 +618,8 @@ class ResultCache:

      def load_results(
          self,
-         models: Sequence[str] | Sequence[ModelMeta] | None = None,
-         tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
+         models: Sequence[str] | Iterable[ModelMeta] | None = None,
+         tasks: Sequence[str] | Iterable[AbsTask] | Benchmark | str | None = None,
          require_model_meta: bool = True,
          include_remote: bool = True,
          validate_and_filter: bool = False,
@@ -478,6 +630,7 @@ class ResultCache:
          Args:
              models: A list of model names to load the results for. If None it will load the results for all models.
              tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
+                 If Benchmark is passed, then all tasks in the benchmark will be loaded.
                  If None it will load the results for all tasks.
              require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                  extract the model name and revision from the path.
@@ -490,8 +643,8 @@ class ResultCache:
              A BenchmarkResults object containing the results for the specified models and tasks.

          Examples:
-             >>> from mteb.cache import ResultCache
-             >>> cache = ResultCache()
+             >>> import mteb
+             >>> cache = mteb.ResultCache()
              >>>
              >>> # Load results for specific models and tasks
              >>> results = cache.load_results(
@@ -511,7 +664,7 @@ class ResultCache:
          )
          models_results = defaultdict(list)

-         task_names = {}
+         task_names: dict[str, AbsTask | None] = {}
          if tasks is not None:
              for task in tasks:
                  if isinstance(task, AbsTask):
@@ -529,10 +682,12 @@ class ResultCache:
              )

              if validate_and_filter:
-                 task = task_names[task_result.task_name]
+                 task_instance = task_names[task_result.task_name]
                  try:
-                     task_result = task_result.validate_and_filter_scores(task=task)
-                 except Exception as e:
+                     task_result = task_result.validate_and_filter_scores(
+                         task=task_instance
+                     )
+                 except ValidationError as e:
                      logger.info(
                          f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
                      )
@@ -541,7 +696,7 @@ class ResultCache:
              models_results[(model_name, revision)].append(task_result)

          # create BenchmarkResults object
-         models_results = [
+         models_results_object = [
              ModelResult(
                  model_name=model_name,
                  model_revision=revision,
@@ -550,9 +705,7 @@ class ResultCache:
              for (model_name, revision), task_results in models_results.items()
          ]

-         benchmark_results = BenchmarkResults(
-             model_results=models_results,
+         return BenchmarkResults(
+             model_results=models_results_object,
              benchmark=tasks if isinstance(tasks, Benchmark) else None,
          )
-
-         return benchmark_results
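Taken together, the new `_download_cached_results_from_branch` helper and the widened `load_results` signature let a caller pull the pre-built leaderboard cache and then query results per benchmark. The snippet below is an illustrative sketch based only on the diff above, not something shipped with the package; the model name, benchmark name, and output path are placeholders, and the helper remains a private method.

    # Illustrative sketch (placeholders throughout, not part of mteb itself).
    from pathlib import Path

    import mteb

    cache = mteb.ResultCache()  # defaults to ~/.cache/mteb

    # Fetch the pre-computed, gzipped results cache instead of cloning the results repo.
    cache_file = cache._download_cached_results_from_branch(
        output_path=Path("/tmp/mteb_cached_results.json")
    )
    print(f"Cached results written to {cache_file}")

    # load_results now also accepts a Benchmark object directly.
    benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # placeholder benchmark name
    results = cache.load_results(models=["intfloat/e5-small"], tasks=benchmark)
    print(results)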
mteb/cli/_display_tasks.py CHANGED
@@ -1,4 +1,4 @@
- from collections.abc import Sequence
+ from collections.abc import Iterable, Sequence

  from mteb.abstasks import AbsTask
  from mteb.benchmarks import Benchmark
@@ -31,7 +31,7 @@ def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
      _display_tasks(benchmark.tasks, name=name)


- def _display_tasks(task_list: Sequence[AbsTask], name: str | None = None) -> None:
+ def _display_tasks(task_list: Iterable[AbsTask], name: str | None = None) -> None:
      from rich.console import Console

      console = Console()
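The Sequence → Iterable loosening above means `_display_tasks` no longer needs a materialised list; any iterable of tasks, including a generator, can be displayed. A minimal sketch under that assumption (the filter shown is only an example):

    # Illustrative only: with Iterable[AbsTask], a generator expression can be passed directly.
    import mteb
    from mteb.cli._display_tasks import _display_tasks

    tasks = mteb.get_tasks(languages=["eng"])
    retrieval_only = (t for t in tasks if t.metadata.type == "Retrieval")
    _display_tasks(retrieval_only, name="English retrieval tasks")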
mteb/cli/build_cli.py CHANGED
@@ -1,17 +1,19 @@
  import argparse
  import logging
  import os
+ import warnings
  from pathlib import Path

  import torch
  from rich.logging import RichHandler

  import mteb
+ from mteb.abstasks.abstask import AbsTask
  from mteb.cache import ResultCache
+ from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
  from mteb.cli.generate_model_card import generate_model_card
  from mteb.evaluate import OverwriteStrategy
-
- from ._display_tasks import _display_benchmarks, _display_tasks
+ from mteb.types._encoder_io import EncodeKwargs

  logger = logging.getLogger(__name__)

@@ -53,7 +55,7 @@ def run(args: argparse.Namespace) -> None:

      if args.benchmarks:
          benchmarks = mteb.get_benchmarks(names=args.benchmarks)
-         tasks = [t for b in benchmarks for t in b.tasks]
+         tasks = tuple(t for b in benchmarks for t in b.tasks)
      else:
          tasks = mteb.get_tasks(
              categories=args.categories,
@@ -63,21 +65,23 @@ def run(args: argparse.Namespace) -> None:
              eval_splits=args.eval_splits,
          )

-     encode_kwargs = {}
+     encode_kwargs: EncodeKwargs = {}
      if args.batch_size is not None:
          encode_kwargs["batch_size"] = args.batch_size

      overwrite_strategy = args.overwrite_strategy
      if args.overwrite:
-         logger.warning(
-             "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead."
+         warnings.warn(
+             "`--overwrite` is deprecated, please use `--overwrite-strategy 'always'` instead.",
+             DeprecationWarning,
          )
          overwrite_strategy = OverwriteStrategy.ALWAYS.value

      prediction_folder = args.prediction_folder
      if args.save_predictions:
-         logger.warning(
-             "`--save_predictions` is deprecated, please use `--prediction-folder` instead."
+         warnings.warn(
+             "`--save_predictions` is deprecated, please use `--prediction-folder` instead.",
+             DeprecationWarning,
          )
          prediction_folder = args.output_folder

@@ -279,23 +283,25 @@ def _create_meta(args: argparse.Namespace) -> None:
          from_existing = Path(from_existing)

      if output_path.exists() and overwrite:
-         logger.warning("Output path already exists, overwriting.")
+         msg = "Output path already exists, overwriting."
+         logger.warning(msg)
+         warnings.warn(msg)
      elif output_path.exists():
          raise FileExistsError(
              "Output path already exists, use --overwrite to overwrite."
          )

-     tasks = []
+     benchmarks = None
+     tasks: list[AbsTask] = []
      if tasks_names is not None:
-         tasks = mteb.get_tasks(tasks_names)
+         tasks = list(mteb.get_tasks(tasks_names))
      if benchmarks is not None:
          benchmarks = mteb.get_benchmarks(benchmarks)
-         for benchmark in benchmarks:
-             tasks.extend(benchmark.tasks)

      generate_model_card(
          model_name,
-         tasks if len(tasks) > 0 else None,
+         tasks,
+         benchmarks,
          existing_model_card_id_or_path=from_existing,
          results_cache=ResultCache(results_folder),
          output_path=output_path,
@@ -356,6 +362,95 @@ def _add_create_meta_parser(subparsers) -> None:
      parser.set_defaults(func=_create_meta)


+ def _add_leaderboard_parser(subparsers) -> None:
+     parser = subparsers.add_parser("leaderboard", help="Launch the MTEB leaderboard")
+
+     parser.add_argument(
+         "--cache-path",
+         type=str,
+         help="Path to the cache folder containing model results",
+         required=False,
+         default=None,
+     )
+     parser.add_argument(
+         "--host",
+         type=str,
+         default="0.0.0.0",
+         help="Host to run the leaderboard server on",
+     )
+     parser.add_argument(
+         "--port",
+         type=int,
+         default=7860,
+         help="Port to run the leaderboard server on",
+     )
+     parser.add_argument(
+         "--share",
+         action="store_true",
+         default=False,
+         help="Create a public URL for the leaderboard",
+     )
+
+     parser.set_defaults(func=_leaderboard)
+
+
+ def _leaderboard(args: argparse.Namespace) -> None:
+     """Launch the MTEB leaderboard with specified cache path."""
+     # Import leaderboard module only when needed to avoid requiring leaderboard dependencies
+     # for other CLI commands
+     try:
+         import gradio as gr
+
+         from mteb.leaderboard import get_leaderboard_app
+     except ImportError as e:
+         raise ImportError(
+             "Seems like some dependencies are not installed. "
+             + "You can likely install these using: `pip install mteb[leaderboard]`. "
+             + f"{e}"
+         )
+
+     cache_path = args.cache_path
+
+     if cache_path:
+         logger.info(f"Using cache path: {cache_path}")
+         cache = ResultCache(cache_path)
+     else:
+         cache = ResultCache()
+         logger.info(f"Using default cache path: {cache.cache_path}")
+
+     app = get_leaderboard_app(cache)
+
+     logger.info(f"Starting leaderboard on {args.host}:{args.port}")
+     if args.share:
+         logger.info("Creating public URL...")
+
+     logging.getLogger("mteb.load_results.task_results").setLevel(
+         logging.ERROR
+     )  # Warnings related to task split
+     logging.getLogger("mteb.model_meta").setLevel(
+         logging.ERROR
+     )  # Warning related to model metadata (fetch_from_hf=False)
+     logging.getLogger("mteb.load_results.benchmark_results").setLevel(
+         logging.ERROR
+     )  # Warning related to model metadata (fetch_from_hf=False)
+     warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
+
+     # Head content for Tailwind CSS
+     head = """
+     <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+     """
+
+     app.launch(
+         server_name=args.host,
+         server_port=args.port,
+         share=args.share,
+         theme=gr.themes.Soft(
+             font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+         ),
+         head=head,
+     )
+
+
  def build_cli() -> argparse.ArgumentParser:
      """Builds the argument parser for the MTEB CLI.

@@ -375,6 +470,7 @@ def build_cli() -> argparse.ArgumentParser:
      _add_available_tasks_parser(subparsers)
      _add_available_benchmarks_parser(subparsers)
      _add_create_meta_parser(subparsers)
+     _add_leaderboard_parser(subparsers)

      return parser

mteb/cli/generate_model_card.py CHANGED
@@ -1,10 +1,12 @@
  import logging
+ import warnings
+ from collections.abc import Sequence
  from pathlib import Path

  from huggingface_hub import ModelCard, ModelCardData, repo_exists

- from mteb import BenchmarkResults
  from mteb.abstasks.abstask import AbsTask
+ from mteb.benchmarks.benchmark import Benchmark
  from mteb.cache import ResultCache

  logger = logging.getLogger(__name__)
@@ -12,12 +14,13 @@ logger = logging.getLogger(__name__)

  def generate_model_card(
      model_name: str,
-     tasks: list[AbsTask] | None = None,
+     tasks: Sequence[AbsTask] | None = None,
+     benchmarks: Sequence[Benchmark] | None = None,
      existing_model_card_id_or_path: str | Path | None = None,
      results_cache: ResultCache = ResultCache(),
      output_path: Path = Path("model_card.md"),
      add_table_to_model_card: bool = False,
-     models_to_compare: list[str] | None = None,
+     models_to_compare: Sequence[str] | None = None,
      token: str | None = None,
      push_to_hub: bool = False,
  ) -> None:
@@ -26,6 +29,7 @@ def generate_model_card(
      Args:
          model_name: Name of the model.
          tasks: List of tasks to generate results for.
+         benchmarks: A Benchmark or list of benchmarks to generate results for.
          existing_model_card_id_or_path: Path or ID of an existing model card to update.
          results_cache: Instance of ResultCache to load results from.
          output_path: Path to save the generated model card.
@@ -39,16 +43,24 @@ def generate_model_card(
      if existing_model_card_id_or_path:
          existing_model_card = ModelCard.load(existing_model_card_id_or_path)

+     all_tasks: list[AbsTask] = []
+     if tasks is not None:
+         all_tasks.extend(tasks)
+
+     if benchmarks is not None:
+         for b in benchmarks:
+             all_tasks.extend(b.tasks)
+
      benchmark_results = results_cache.load_results(
-         [model_name], tasks, only_main_score=True
+         [model_name], all_tasks if all_tasks else None, only_main_score=True
      )
      eval_results = []
      for models_results in benchmark_results.model_results:
          for task_result in models_results.task_results:
              eval_results.extend(task_result.get_hf_eval_results())

-     existing_model_card_data = (
-         existing_model_card.data if existing_model_card else ModelCardData()
+     existing_model_card_data: ModelCardData = (
+         existing_model_card.data if existing_model_card else ModelCardData()  # type: ignore[assignment]
      )

      if existing_model_card_data.eval_results is None:
@@ -78,35 +90,43 @@ def generate_model_card(
          card_data=existing_model_card_data
      )

-     if models_to_compare:
-         benchmark_results = results_cache.load_results(
-             [model_name, *models_to_compare], tasks, only_main_score=True
-         )
-
      if add_table_to_model_card:
          existing_model_card = _add_table_to_model_card(
-             benchmark_results, existing_model_card
+             results_cache,
+             existing_model_card,
+             (model_name, *models_to_compare) if models_to_compare else (model_name,),
+             benchmarks or [],
          )

-     if push_to_hub:
+     if push_to_hub and existing_model_card_id_or_path:
+         existing_model_card_id_or_path = str(existing_model_card_id_or_path)
          if repo_exists(existing_model_card_id_or_path):
              existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
          else:
-             logger.warning(
-                 f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
-             )
+             msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
+             logger.warning(msg)
+             warnings.warn(msg)
      existing_model_card.save(output_path)


  def _add_table_to_model_card(
-     results: BenchmarkResults, model_card: ModelCard
+     results_cache: ResultCache,
+     model_card: ModelCard,
+     models: Sequence[str],
+     benchmarks: Sequence[Benchmark],
  ) -> ModelCard:
      original_content = model_card.content
-     results_df = results.to_dataframe()
-     results_df = results_df.set_index("task_name")
-     mteb_content = f"""
-     # MTEB results
-     {results_df.to_markdown()}
-     """
+     mteb_content = "# MTEB Results\n\n"
+
+     for benchmark in benchmarks:
+         mteb_content += f"## Benchmark: {benchmark.name}\n\n"
+         benchmark_results = results_cache.load_results(
+             tasks=benchmark,
+             models=models,
+             only_main_score=True,
+         )
+         df_results = benchmark_results.get_benchmark_result()
+         mteb_content += df_results.to_markdown(index=True) + "\n\n"
+
      model_card.content = original_content + "\n\n" + mteb_content
      return model_card
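With the reworked signature, tasks and benchmarks are passed separately and the comparison table is rebuilt per benchmark from the results cache. A hedged usage sketch of the updated `generate_model_card` call follows; the model, task, and benchmark names are placeholders and not recommendations:

    # Illustrative call of the updated signature; names are placeholders.
    from pathlib import Path

    import mteb
    from mteb.cache import ResultCache
    from mteb.cli.generate_model_card import generate_model_card

    generate_model_card(
        "intfloat/e5-small",
        tasks=list(mteb.get_tasks(["STS12"])),
        benchmarks=mteb.get_benchmarks(["MTEB(eng, v2)"]),
        results_cache=ResultCache(),
        output_path=Path("model_card.md"),
        add_table_to_model_card=True,
    )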