mteb 2.5.3__py3-none-any.whl → 2.5.5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- mteb/_create_dataloaders.py +10 -15
- mteb/_evaluators/any_sts_evaluator.py +1 -4
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +5 -6
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +17 -16
- mteb/_evaluators/sklearn_evaluator.py +9 -8
- mteb/_evaluators/text/bitext_mining_evaluator.py +23 -16
- mteb/_evaluators/text/summarization_evaluator.py +20 -16
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +27 -21
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +3 -16
- mteb/abstasks/classification.py +10 -4
- mteb/abstasks/clustering.py +18 -14
- mteb/abstasks/clustering_legacy.py +8 -8
- mteb/abstasks/image/image_text_pair_classification.py +5 -3
- mteb/abstasks/multilabel_classification.py +20 -16
- mteb/abstasks/pair_classification.py +18 -9
- mteb/abstasks/regression.py +3 -3
- mteb/abstasks/retrieval.py +12 -9
- mteb/abstasks/sts.py +6 -3
- mteb/abstasks/task_metadata.py +20 -16
- mteb/abstasks/text/bitext_mining.py +36 -25
- mteb/abstasks/text/reranking.py +7 -5
- mteb/abstasks/text/summarization.py +8 -3
- mteb/abstasks/zeroshot_classification.py +5 -2
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/benchmarks.py +22 -1
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +21 -18
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +8 -8
- mteb/cli/generate_model_card.py +39 -20
- mteb/deprecated_evaluator.py +56 -43
- mteb/evaluate.py +35 -29
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +25 -27
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +1 -1
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +2 -2
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +2 -1
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +30 -13
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +8 -1
- mteb/models/instruct_wrapper.py +11 -5
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bm25.py +1 -1
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/dino_models.py +23 -23
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/model2vec_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +8 -8
- mteb/models/model_implementations/openclip_models.py +7 -7
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/repllama_models.py +2 -2
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +3 -3
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/vlm2vec_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +4 -4
- mteb/models/model_meta.py +30 -14
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +5 -5
- mteb/models/search_wrappers.py +22 -10
- mteb/models/sentence_transformer_wrapper.py +9 -4
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +25 -19
- mteb/results/model_result.py +49 -21
- mteb/results/task_result.py +45 -51
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/METADATA +1 -1
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/RECORD +105 -104
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/WHEEL +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.3.dist-info → mteb-2.5.5.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED

```diff
@@ -22,6 +22,7 @@ from huggingface_hub import (
 from huggingface_hub.errors import (
     EntryNotFoundError,
     GatedRepoError,
+    HFValidationError,
     NotASafetensorsRepoError,
     RepositoryNotFoundError,
     SafetensorsParsingError,
@@ -81,7 +82,7 @@ def _get_loader_name(
     return loader.__name__


-_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
+_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"


 class ModelMeta(BaseModel):
@@ -263,10 +264,8 @@ class ModelMeta(BaseModel):
         _kwargs = self.loader_kwargs.copy()
         _kwargs.update(kwargs)

-        model:
-
-        )
-        model.mteb_model_meta = self  # type: ignore
+        model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
+        model.mteb_model_meta = self  # type: ignore[misc]
         return model

     def model_name_as_path(self) -> str:
@@ -307,7 +306,7 @@ class ModelMeta(BaseModel):
         embedding_dim = None
         max_tokens = None

-        if model_name and compute_metadata and
+        if model_name and compute_metadata and _repo_exists(model_name):
             reference = "https://huggingface.co/" + model_name
             card = ModelCard.load(model_name)
             card_data: ModelCardData = card.data
@@ -318,9 +317,8 @@ class ModelMeta(BaseModel):
                 model_config = None
                 logger.warning(f"Can't get configuration for {model_name}. Error: {e}")

-            if (
-                card_data.
-                or _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
+            if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
+                card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
             ):
                 frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
             else:
@@ -417,7 +415,7 @@ class ModelMeta(BaseModel):
             meta.framework.append("Sentence Transformers")
             meta.modalities = ["text"]

-        if model and compute_metadata and
+        if model and compute_metadata and _repo_exists(model):
             # have max_seq_length field
             sbert_config = _get_json_from_hub(
                 model, "sentence_bert_config.json", "model", revision=revision
@@ -435,7 +433,7 @@ class ModelMeta(BaseModel):
                 and config_sbert.get("similarity_fn_name") is not None
             ):
                 meta.similarity_fn_name = ScoringFunction.from_str(
-                    config_sbert
+                    config_sbert["similarity_fn_name"]
                 )
             else:
                 meta.similarity_fn_name = ScoringFunction.COSINE
@@ -516,7 +514,7 @@ class ModelMeta(BaseModel):
            warnings.warn(msg)

         return_dataset = training_datasets.copy()
-        visited = set()
+        visited: set[str] = set()

         for dataset in training_datasets:
             similar_tasks = _collect_similar_tasks(dataset, visited)
@@ -550,6 +548,8 @@ class ModelMeta(BaseModel):

     @staticmethod
     def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
+        if not model_name:
+            return None
         try:
             safetensors_metadata = get_safetensors_metadata(model_name)
             if len(safetensors_metadata.parameter_count) >= 0:
@@ -563,7 +563,7 @@ class ModelMeta(BaseModel):
            logger.warning(
                f"Can't calculate number of parameters for {model_name}. Got error {e}"
            )
-
+            return None

     def calculate_num_parameters_from_hub(self) -> int | None:
         """Calculates the number of parameters in the model.
@@ -626,7 +626,7 @@ class ModelMeta(BaseModel):
         if "API" in self.framework or self.name is None:
             return None

-        return self._calculate_memory_usage_mb(self.
+        return self._calculate_memory_usage_mb(self.name, self.n_parameters)

     @staticmethod
     def fetch_release_date(model_name: str) -> StrDate | None:
@@ -786,3 +786,19 @@ def _get_file_on_hub(
     except (GatedRepoError, RepositoryNotFoundError, EntryNotFoundError) as e:
         logger.warning(f"Can't get file {file_name} of {repo_id}: {e}")
         return None
+
+
+def _repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
+    """Checks if a repository exists on HuggingFace Hub.
+
+    Repo exists will raise HFValidationError for invalid local paths
+
+    Args:
+        repo_id: The repository ID.
+        repo_type: The type of repository (e.g., "model", "dataset", "space").
+    """
+    try:
+        return repo_exists(repo_id=repo_id, repo_type=repo_type)
+    except HFValidationError as e:
+        logger.warning(f"Can't check existence of {repo_id}: {e}")
+        return False
```
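For context, the new `_repo_exists` helper guards the Hub metadata lookups above: `huggingface_hub.repo_exists` raises `HFValidationError` when given a string that is not a valid repo ID (for example a local path), so the check is wrapped and downgraded to `False`. Below is a minimal, self-contained sketch of the same pattern outside mteb; the `safe_repo_exists` name and the example model ID are illustrative only.

```python
from __future__ import annotations

import logging

from huggingface_hub import repo_exists
from huggingface_hub.errors import HFValidationError

logger = logging.getLogger(__name__)


def safe_repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
    """Return True only if the repo exists; never raise on malformed repo IDs."""
    try:
        return repo_exists(repo_id=repo_id, repo_type=repo_type)
    except HFValidationError as e:  # e.g. a local filesystem path was passed in
        logger.warning(f"Can't check existence of {repo_id}: {e}")
        return False


# Hypothetical usage mirroring the changed call sites: only hit the Hub for
# model-card/config metadata when the repository actually exists.
if safe_repo_exists("sentence-transformers/all-MiniLM-L6-v2"):
    ...  # load ModelCard / config from the Hub
```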
mteb/models/search_encoder_index/search_indexes/faiss_search_index.py
CHANGED

```diff
@@ -109,7 +109,7 @@ class FaissSearchIndex:
             ids = ids.tolist()

         if issubclass(self.index_type, faiss.IndexFlatL2):
-            similarities = -np.sqrt(np.maximum(similarities, 0))
+            similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()

         return similarities, ids

@@ -117,8 +117,8 @@ class FaissSearchIndex:
         self,
         embeddings: Array,
         top_k: int,
-        top_ranked: TopRankedDocumentsType
-        query_idx_to_id: dict[int, str]
+        top_ranked: TopRankedDocumentsType,
+        query_idx_to_id: dict[int, str],
     ) -> tuple[list[list[float]], list[list[int]]]:
         doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
         scores_all: list[list[float]] = []
@@ -136,9 +136,9 @@ class FaissSearchIndex:
                 continue

             candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
-            d = self.index.d
+            d = self.index.d  # type: ignore[union-attr]
             candidate_embs = np.vstack(
-                [self.index.reconstruct(idx) for idx in candidate_indices]
+                [self.index.reconstruct(idx) for idx in candidate_indices]  # type: ignore[union-attr]
             )
             sub_reranking_index = self.index_type(d)
             sub_reranking_index.add(candidate_embs)
```
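The first hunk touches the distance-to-score conversion: `IndexFlatL2.search` returns squared L2 distances, which are clamped at zero, square-rooted, and negated so that larger values still mean "more similar", then turned into plain Python lists. A small standalone sketch of that conversion on toy random embeddings (assuming `faiss` and `numpy` are installed; not mteb code):

```python
import faiss
import numpy as np

xb = np.random.rand(8, 4).astype("float32")  # toy corpus embeddings
xq = np.random.rand(2, 4).astype("float32")  # toy query embeddings

index = faiss.IndexFlatL2(xb.shape[1])
index.add(xb)

# search returns squared L2 distances and corpus row indices
squared_distances, ids = index.search(xq, 3)

# clamp, sqrt, negate: smaller distance -> larger (less negative) score
similarities = (-np.sqrt(np.maximum(squared_distances, 0))).tolist()
ids = ids.tolist()
```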
mteb/models/search_wrappers.py
CHANGED

```diff
@@ -200,7 +200,7 @@ class SearchEncoderWrapper:
         # Reset the task corpus dataloader to None to free up memory
         self.task_corpus = None

-        results = {qid: {} for qid in query_idx_to_id.values()}
+        results: RetrievalOutputType = {qid: {} for qid in query_idx_to_id.values()}
         for qid in result_heaps:
             for score, corpus_id in result_heaps[qid]:
                 results[qid][corpus_id] = score
@@ -218,13 +218,19 @@ class SearchEncoderWrapper:
         encode_kwargs: dict[str, Any],
     ) -> dict[str, list[tuple[float, str]]]:
         logger.info("Encoding Corpus in batches (this might take a while)...")
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+
         itr = range(0, len(self.task_corpus), self.corpus_chunk_size)

-        result_heaps
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         for batch_num, corpus_start_idx in enumerate(itr):
             logger.info(f"Encoding Batch {batch_num + 1}/{len(itr)}...")
             corpus_end_idx = min(
-                corpus_start_idx + self.corpus_chunk_size,
+                corpus_start_idx + self.corpus_chunk_size,
+                len(self.task_corpus),
             )
             sub_corpus = self.task_corpus.select(
                 range(corpus_start_idx, corpus_end_idx)
@@ -249,7 +255,7 @@ class SearchEncoderWrapper:
             scores = self.model.similarity(query_embeddings, sub_corpus_embeddings)

             # get top-k values
-
+            cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = torch.topk(
                 torch.as_tensor(scores),
                 min(
                     top_k + 1,
@@ -258,8 +264,8 @@ class SearchEncoderWrapper:
                 dim=1,
                 largest=True,
             )
-            cos_scores_top_k_idx =
-            cos_scores_top_k_values =
+            cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
+            cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()

             sub_corpus_ids = list(sub_corpus_ids)
             result_heaps = self._sort_full_corpus_results(
@@ -319,7 +325,11 @@ class SearchEncoderWrapper:
         Returns:
             A dictionary mapping query IDs to a list of tuples, each containing a relevance score and a document ID.
         """
-
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")
+        result_heaps: dict[str, list[tuple[float, str]]] = {
+            qid: [] for qid in query_idx_to_id.values()
+        }
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}

         all_doc_embeddings = self.model.encode(
@@ -387,12 +397,12 @@ class SearchEncoderWrapper:

     def _rerank_sort_results(
         self,
-        result_heaps: list[tuple[float, str]],
+        result_heaps: dict[str, list[tuple[float, str]]],
         query_id: str,
         ranked_ids: list[str],
         scores_top_k_idx: torch.Tensor,
         scores_top_k_values: torch.Tensor,
-    ) -> list[tuple[float, str]]:
+    ) -> dict[str, list[tuple[float, str]]]:
         """Sort the heap into descending order list.

         Returns:
@@ -503,6 +513,8 @@ class SearchCrossEncoderWrapper:
             raise ValueError(
                 "CrossEncoder search requires top_ranked documents for reranking."
             )
+        if self.task_corpus is None:
+            raise ValueError("Corpus must be indexed before searching.")

         query_id_to_idx = {row["id"]: i for i, row in enumerate(queries)}
         doc_id_to_idx = {doc["id"]: idx for idx, doc in enumerate(self.task_corpus)}
@@ -542,7 +554,7 @@ class SearchCrossEncoderWrapper:
                 hf_subset=hf_subset,
             )

-        results = {qid: {} for qid in queries["id"]}
+        results: RetrievalOutputType = {qid: {} for qid in queries["id"]}
         for (query_id, corpus_id), score in zip(doc_pairs_ids, predictions):
             results[query_id][corpus_id] = float(score)
```
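The restored top-k step above is plain `torch.topk` over each corpus chunk, with values and indices moved back to CPU lists before they are pushed onto the per-query result heaps. A minimal standalone sketch of that step with random scores and hypothetical shapes (not mteb code):

```python
import torch

# (n_queries, chunk_size) similarity scores for one corpus chunk
scores = torch.rand(2, 100)
top_k = 10

# keep the best candidates per query; top_k + 1 mirrors the hunk above
values, indices = torch.topk(
    torch.as_tensor(scores),
    min(top_k + 1, scores.shape[1]),
    dim=1,
    largest=True,
)
cos_scores_top_k_values = values.cpu().tolist()
cos_scores_top_k_idx = indices.cpu().tolist()
```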
mteb/models/sentence_transformer_wrapper.py
CHANGED

```diff
@@ -103,8 +103,11 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
             logger.warning(msg)
             warnings.warn(msg)

+    def similarity(self, embeddings1: Array, embeddings2: Array) -> Array:
+        """Compute the similarity between two collections of embeddings."""
         if hasattr(self.model, "similarity") and callable(self.model.similarity):
-
+            return self.model.similarity(embeddings1, embeddings2)
+        return super().similarity(embeddings1, embeddings2)

     def encode(
         self,
@@ -150,7 +153,7 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             prompt_log = f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
         else:
@@ -221,7 +224,7 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         prompt_name = None
         if self.model_prompts is not None:
             prompt_name = self.get_prompt_name(task_metadata, prompt_type)
-            prompt = self.model_prompts.get(prompt_name, None)
+            prompt = self.model_prompts.get(prompt_name, None)  # type: ignore[arg-type]
         if prompt_name:
             logger.info(
                 f"Using {prompt_name=} for task={task_metadata.name} {prompt_type=} with {prompt=}"
@@ -234,7 +237,9 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
         all_embeddings = []
         for batch in inputs:
             batch_column = next(iter(batch.keys()))
-            batched_input
+            batched_input: list[dict[str, Any]] = [
+                dict() for _ in range(len(batch[batch_column]))
+            ]

             # transform from {"text": [text1, text2], "image": [image1, image2]} to
             # [{"text": text1, "image": image1}, {"text": text2, "image": image2}]
```
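The new `similarity` method is a small delegation pattern: use the wrapped model's own `similarity` when it provides one, otherwise fall back to the base encoder's implementation. A simplified sketch of the idea; the class names and the cosine fallback are illustrative, not mteb's actual classes:

```python
import numpy as np


class BaseEncoder:
    def similarity(self, embeddings1, embeddings2):
        # fallback: plain cosine similarity between row vectors
        a = embeddings1 / np.linalg.norm(embeddings1, axis=1, keepdims=True)
        b = embeddings2 / np.linalg.norm(embeddings2, axis=1, keepdims=True)
        return a @ b.T


class WrappedEncoder(BaseEncoder):
    def __init__(self, model):
        self.model = model

    def similarity(self, embeddings1, embeddings2):
        # prefer the wrapped model's own similarity function when available
        if hasattr(self.model, "similarity") and callable(self.model.similarity):
            return self.model.similarity(embeddings1, embeddings2)
        return super().similarity(embeddings1, embeddings2)
```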
mteb/py.typed
ADDED
File without changes
mteb/results/benchmark_results.py
CHANGED

```diff
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import functools
 import json
 import logging
 import warnings
-from collections.abc import Callable, Iterable, Iterator
+from collections.abc import Callable, Iterable, Iterator
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast

 import pandas as pd
 from packaging.version import InvalidVersion, Version
@@ -33,11 +35,12 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


-# Global cache for model metas and version parsing
 @functools.lru_cache
 def _get_cached_model_metas() -> dict[str, str | None]:
     """Cache model metas to avoid repeated calls."""
-    return {
+    return {
+        meta.name: meta.revision for meta in get_model_metas() if meta.name is not None
+    }


 @functools.lru_cache(maxsize=10000)
@@ -77,10 +80,10 @@ class BenchmarkResults(BaseModel):
         task_names: list[str] | None = None,
         languages: list[str] | None = None,
         domains: list[TaskDomain] | None = None,
-        task_types: list[TaskType] | None = None,
+        task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         # TODO: Same as filter_models
         model_results = [
             res._filter_tasks(
@@ -97,7 +100,7 @@ class BenchmarkResults(BaseModel):
             model_results=[res for res in model_results if res.task_results]
         )

-    def select_tasks(self, tasks:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
         """Select tasks from the benchmark results.

         Args:
@@ -115,7 +118,7 @@ class BenchmarkResults(BaseModel):
         self,
         names: list[str] | list[ModelMeta],
         revisions: list[str | None] | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         """Get models by name and revision.

         Args:
@@ -128,7 +131,7 @@ class BenchmarkResults(BaseModel):
         models_res = []
         _revisions = revisions if revisions is not None else [None] * len(names)

-        name_rev = {}
+        name_rev: dict[str, str | None] = {}

         if len(names) != len(_revisions):
             raise ValueError(
@@ -137,9 +140,12 @@ class BenchmarkResults(BaseModel):

         for name, revision in zip(names, _revisions):
             if isinstance(name, ModelMeta):
+                if name.name is None:
+                    raise ValueError("name in ModelMeta is None. It must be a string.")
                 name_rev[name.name] = name.revision
             else:
-
+                name_ = cast(str, name)
+                name_rev[name_] = revision

         for model_res in self.model_results:
             model_name = model_res.model_name
@@ -159,7 +165,7 @@ class BenchmarkResults(BaseModel):
         n_parameters_range: tuple[int | None, int | None] = (None, None),
         use_instructions: bool | None = None,
         zero_shot_on: list[AbsTask] | None = None,
-    ) ->
+    ) -> BenchmarkResults:
         # mostly a utility function for the leaderboard app.
         # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
         # interface would then be the same as the get_models function
@@ -182,7 +188,7 @@ class BenchmarkResults(BaseModel):

         return type(self).model_construct(model_results=new_model_results)

-    def join_revisions(self) ->
+    def join_revisions(self) -> BenchmarkResults:
         """Join revisions of the same model.

         In case of conflicts, the following rules are applied:
@@ -212,10 +218,10 @@ class BenchmarkResults(BaseModel):

         # Use cached model metas
         model_to_main_revision = _get_cached_model_metas()
-        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
+        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

         # Use cached version parsing
-        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

         # Filter out rows without scores first
         task_df = task_df[task_df["has_scores"]]
@@ -259,8 +265,8 @@ class BenchmarkResults(BaseModel):
         # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
-                model_name=model,
-                model_revision=model_revision,
+                model_name=model,  # type: ignore[arg-type]
+                model_revision=model_revision,  # type: ignore[arg-type]
                 task_results=list(group["task_result"]),
             )
             model_results.append(model_result)
@@ -291,7 +297,7 @@ class BenchmarkResults(BaseModel):
                     {
                         "model": model_res.model_name,
                         "revision": model_res.model_revision,
-                        **model_scores,
+                        **model_scores,
                     }
                 )
             except Exception as e:
@@ -404,7 +410,7 @@ class BenchmarkResults(BaseModel):

         return self.benchmark._create_summary_table(self)

-    def __iter__(self) -> Iterator[ModelResult]:
+    def __iter__(self) -> Iterator[ModelResult]:  # type: ignore[override]
         return iter(self.model_results)

     def __getitem__(self, index: int) -> ModelResult:
@@ -426,7 +432,7 @@ class BenchmarkResults(BaseModel):
             out_file.write(self.model_dump_json(indent=2))

     @classmethod
-    def from_validated(cls, **data) ->
+    def from_validated(cls, **data) -> BenchmarkResults:
         """Create BenchmarkResults from validated data.

         Args:
```
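Several of these hunks complete return annotations such as `-> BenchmarkResults` on methods of `BenchmarkResults` itself; the `from __future__ import annotations` import added at the top of the module is what allows that without quoting the class name. A minimal toy example of the mechanism (unrelated class names, not mteb code):

```python
# With postponed evaluation of annotations (PEP 563), a method may annotate its
# return type with the enclosing class directly instead of "Results" in quotes.
from __future__ import annotations


class Results:
    def __init__(self, items: list[str]) -> None:
        self.items = items

    def filtered(self, keep: set[str]) -> Results:  # no forward-reference quotes
        return Results([item for item in self.items if item in keep])


print(Results(["a", "b"]).filtered({"a"}).items)  # ['a']
```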
mteb/results/model_result.py
CHANGED

```diff
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Callable, Iterable
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast

 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import
+from typing_extensions import overload

 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -58,7 +60,7 @@ def _aggregate_and_pivot(
             index=index_columns,
             columns=columns,
             values="score",
-            aggfunc=aggregation_fn,
+            aggfunc=aggregation_fn,  # type: ignore[arg-type]
         ).reset_index()
     elif format == "long":
         return (
@@ -81,7 +83,7 @@ class ModelResult(BaseModel):
     model_revision: str | None
     task_results: list[TaskResult]
     default_modalities: list[Modalities] = Field(
-        default_factory=lambda: ["text"], alias="modalities"
+        default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
     )
     model_config = (
         ConfigDict(  # to free up the name model_* which is otherwise protected
@@ -95,16 +97,17 @@ class ModelResult(BaseModel):
         return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"

     @classmethod
-    def from_validated(cls, **data: dict[str, Any]) ->
+    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
         """Create a ModelResult from validated data.

         Args:
             data: The validated data.
         """
-        data["task_results"] = [
-            TaskResult.from_validated(**res)
+        data["task_results"] = [  # type: ignore[assignment]
+            TaskResult.from_validated(**res)  # type: ignore[arg-type]
+            for res in data["task_results"]
         ]
-        return cls.model_construct(**data)
+        return cls.model_construct(**data)  # type: ignore[arg-type]

     def _filter_tasks(
         self,
@@ -114,7 +117,7 @@ class ModelResult(BaseModel):
         task_types: list[TaskType] | None = None,
         modalities: list[Modalities] | None = None,
         is_public: bool | None = None,
-    ) ->
+    ) -> ModelResult:
         new_task_results = []
         for task_result in self.task_results:
             if (task_names is not None) and (task_result.task_name not in task_names):
@@ -142,7 +145,7 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

-    def select_tasks(self, tasks:
+    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
         """Select tasks from the ModelResult based on a list of AbsTask objects.

         Args:
@@ -160,6 +163,28 @@ class ModelResult(BaseModel):
             task_results=new_task_results,
         )

+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["wide"] = "wide",
+    ) -> dict: ...
+
+    @overload
+    def _get_scores(
+        self,
+        splits: list[SplitName] | None = None,
+        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
+        scripts: list[ISOLanguageScript] | None = None,
+        getter: Callable[[ScoresDict], Score] | None = None,
+        aggregation: Callable[[list[Score]], Any] | None = None,
+        format: Literal["long"] = "long",
+    ) -> list: ...
+
     def _get_scores(
         self,
         splits: list[SplitName] | None = None,
@@ -177,21 +202,24 @@ class ModelResult(BaseModel):
             aggregation = aggregation if aggregation is not None else np.mean
         else:
             use_fast = True
+        aggregation = cast(Callable[[list[Score]], Any], aggregation)
+        getter = cast(Callable[[ScoresDict], Score], getter)
+
         if format == "wide":
             scores = {}
             for res in self.task_results:
                 try:
                     if use_fast:
                         scores[res.task_name] = res._get_score_fast(
-                            splits=splits,
-                            languages=languages,
+                            splits=splits,
+                            languages=languages,
                         )
                     else:
                         scores[res.task_name] = res.get_score(
                             splits=splits,
                             languages=languages,
-                            aggregation=aggregation,
-                            getter=getter,
+                            aggregation=aggregation,
+                            getter=getter,
                             scripts=scripts,
                         )
                 except Exception as e:
@@ -206,14 +234,14 @@ class ModelResult(BaseModel):
                 if use_fast:
                     score = task_res._get_score_fast(
                         splits=splits,
-                        languages=languages,
+                        languages=languages,
                     )
                 else:
                     score = task_res.get_score(
                         splits=splits,
                         languages=languages,
-                        aggregation=aggregation,
-                        getter=getter,
+                        aggregation=aggregation,
+                        getter=getter,
                         scripts=scripts,
                     )
                 entry = dict(
@@ -317,7 +345,7 @@ class ModelResult(BaseModel):
     def __hash__(self) -> int:
         return id(self)

-    def __iter__(self) -> Iterable[TaskResult]:
+    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
         return iter(self.task_results)

     def __getitem__(self, index) -> TaskResult:
@@ -370,13 +398,13 @@ class ModelResult(BaseModel):
         return [task_res.task_name for task_res in self.task_results]

     @property
-    def modalities(self) -> list[
+    def modalities(self) -> list[Modalities]:
         """Get all modalities in the task results.

         Returns:
             A list of modalities in the task results.
         """
-        mods = []
+        mods: list[Modalities] = []
         for task_res in self.task_results:
             task_modalities = getattr(task_res, "modalities", [])
             mods.extend(task_modalities)
```
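The `@overload` declarations added for `_get_scores` encode that `format="wide"` yields a dict while `format="long"` yields a list, even though a single runtime implementation serves both. A simplified, self-contained sketch of this typing pattern (hypothetical `Scores` class and `get` method, not mteb's API):

```python
from typing import Any, Literal, overload


class Scores:
    @overload
    def get(self, format: Literal["wide"] = "wide") -> dict[str, float]: ...

    @overload
    def get(self, format: Literal["long"]) -> list[dict[str, Any]]: ...

    def get(self, format: Literal["wide", "long"] = "wide"):
        # single runtime implementation; the overloads above only inform the
        # type checker which return type corresponds to which literal argument
        data = {"TaskA": 0.5, "TaskB": 0.7}
        if format == "wide":
            return data
        return [{"task_name": name, "score": score} for name, score in data.items()]


wide = Scores().get()        # type checker sees dict[str, float]
long = Scores().get("long")  # type checker sees list[dict[str, Any]]
```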