mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/abstasks/retrieval_dataset_loaders.py
CHANGED

@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import logging
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict
 
 from datasets import (
     Dataset,
@@ -11,13 +13,14 @@ from datasets import (
     load_dataset,
 )
 
-from mteb.types import (
-    CorpusDatasetType,
-    InstructionDatasetType,
-    QueryDatasetType,
-    RelevantDocumentsType,
-    TopRankedDocumentsType,
-)
+if TYPE_CHECKING:
+    from mteb.types import (
+        CorpusDatasetType,
+        InstructionDatasetType,
+        QueryDatasetType,
+        RelevantDocumentsType,
+        TopRankedDocumentsType,
+    )
 
 logger = logging.getLogger(__name__)
 
@@ -73,28 +76,36 @@ class RetrievalDatasetLoader:
         self.config = config if config != "default" else None
         self.dataset_configs = get_dataset_config_names(self.hf_repo, self.revision)
 
-    def load(
+    def load(
+        self,
+        num_proc: int = 1,
+    ) -> RetrievalSplitData:
         """Loads the dataset split for the specified configuration.
 
+        Args:
+            num_proc: The number of processes to use.
+
         Returns:
             A dictionary containing the corpus, queries, relevant documents, instructions (if applicable), and top-ranked documents (if applicable).
         """
         top_ranked = None
 
-        qrels = self._load_qrels()
-        corpus = self._load_corpus()
-        queries = self._load_queries()
+        qrels = self._load_qrels(num_proc)
+        corpus = self._load_corpus(num_proc)
+        queries = self._load_queries(num_proc)
 
         queries = queries.filter(
             lambda x: x["id"] in qrels.keys(), desc="Filtering queries by qrels"
         )
 
         if any(c.endswith("top_ranked") for c in self.dataset_configs):
-            top_ranked = self._load_top_ranked()
+            top_ranked = self._load_top_ranked(num_proc)
 
         if any(c.endswith("instruction") for c in self.dataset_configs):
-            instructions = self._load_instructions()
-            queries = _combine_queries_with_instructions_datasets(
+            instructions = self._load_instructions(num_proc)
+            queries = _combine_queries_with_instructions_datasets(
+                queries, instructions, num_proc
+            )
 
         return RetrievalSplitData(
             corpus=corpus,
@@ -117,49 +128,50 @@ class RetrievalDatasetLoader:
                 f"Split {self.split} not found in {splits}. Please specify a valid split."
             )
 
-    def _load_dataset_split(self, config: str) -> Dataset:
+    def _load_dataset_split(self, config: str, num_proc: int) -> Dataset:
         return load_dataset(
             self.hf_repo,
             config,
             split=self._get_split(config),
             trust_remote_code=self.trust_remote_code,
             revision=self.revision,
+            num_proc=num_proc,
         )
 
-    def _load_corpus(self) -> CorpusDatasetType:
-        logger.info("Loading Corpus...")
-
+    def _load_corpus(self, num_proc: int) -> CorpusDatasetType:
         config = f"{self.config}-corpus" if self.config is not None else "corpus"
-        corpus_ds = self._load_dataset_split(config)
+        logger.info("Loading corpus subset: %s", config)
+
+        corpus_ds = self._load_dataset_split(config, num_proc)
         if "_id" in corpus_ds.column_names:
             corpus_ds = corpus_ds.cast_column("_id", Value("string")).rename_column(
                 "_id", "id"
             )
         logger.info("Loaded %d %s Documents.", len(corpus_ds), self.split.upper())
-        logger.
+        logger.debug("Doc Example: %s", corpus_ds[0])
         return corpus_ds
 
-    def _load_queries(self) -> QueryDatasetType:
-        logger.info("Loading Queries...")
-
+    def _load_queries(self, num_proc: int) -> QueryDatasetType:
         config = f"{self.config}-queries" if self.config is not None else "queries"
+        logger.info("Loading queries subset: %s", config)
+
         if "query" in self.dataset_configs:
             config = "query"
-        queries_ds = self._load_dataset_split(config)
+        queries_ds = self._load_dataset_split(config, num_proc)
         if "_id" in queries_ds.column_names:
            queries_ds = queries_ds.cast_column("_id", Value("string")).rename_column(
                 "_id", "id"
             )
 
         logger.info("Loaded %d %s queries.", len(queries_ds), self.split.upper())
-        logger.
+        logger.debug("Query Example: %s", queries_ds[0])
 
         return queries_ds
 
-    def _load_qrels(self) -> RelevantDocumentsType:
-        logger.info("Loading qrels...")
-
+    def _load_qrels(self, num_proc: int) -> RelevantDocumentsType:
        config = f"{self.config}-qrels" if self.config is not None else "default"
+
+        logger.info("Loading qrels subset: %s", config)
         if config == "default" and config not in self.dataset_configs:
             if "qrels" in self.dataset_configs:
                 config = "qrels"
@@ -168,7 +180,7 @@ class RetrievalDatasetLoader:
                 "No qrels or default config found. Please specify a valid config or ensure the dataset has qrels."
             )
 
-        qrels_ds = self._load_dataset_split(config)
+        qrels_ds = self._load_dataset_split(config, num_proc)
         qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"])
 
         qrels_ds = qrels_ds.cast(
@@ -191,13 +203,12 @@ class RetrievalDatasetLoader:
         logger.info("Loaded %d %s qrels.", len(qrels_dict), self.split.upper())
         return qrels_dict
 
-    def _load_top_ranked(self) -> TopRankedDocumentsType:
-        logger.info("Loading Top Ranked")
-
+    def _load_top_ranked(self, num_proc: int) -> TopRankedDocumentsType:
         config = (
             f"{self.config}-top_ranked" if self.config is not None else "top_ranked"
         )
-        top_ranked_ds = self._load_dataset_split(config)
+        logger.info("Loading top ranked subset: %s", config)
+        top_ranked_ds = self._load_dataset_split(config, num_proc)
         top_ranked_ds = top_ranked_ds.cast(
             Features(
                 {
@@ -215,13 +226,12 @@ class RetrievalDatasetLoader:
         logger.info(f"Top ranked loaded: {len(top_ranked_ds)}")
         return top_ranked_dict
 
-    def _load_instructions(self) -> InstructionDatasetType:
-        logger.info("Loading Instructions")
-
+    def _load_instructions(self, num_proc: int) -> InstructionDatasetType:
         config = (
             f"{self.config}-instruction" if self.config is not None else "instruction"
         )
-        instructions_ds = self._load_dataset_split(config)
+        logger.info("Loading instruction subset: %s", config)
+        instructions_ds = self._load_dataset_split(config, num_proc)
         instructions_ds = instructions_ds.cast(
             Features(
                 {
@@ -236,6 +246,7 @@ class RetrievalDatasetLoader:
 def _combine_queries_with_instructions_datasets(
     queries_dataset: QueryDatasetType,
     instruction_dataset: InstructionDatasetType | dict[str, str],
+    num_proc: int,
 ) -> Dataset:
     if isinstance(instruction_dataset, Dataset):
         instruction_to_query_idx = {
@@ -248,4 +259,4 @@ def _combine_queries_with_instructions_datasets(
         row["instruction"] = instruction_to_query_idx[row["id"]]
         return row
 
-    return queries_dataset.map(_add_instruction_to_query)
+    return queries_dataset.map(_add_instruction_to_query, num_proc=num_proc)
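The recurring change in this file is a new num_proc argument threaded from RetrievalDatasetLoader.load() down into the Hugging Face datasets calls. A minimal, self-contained sketch of that pattern follows; the toy dataset and the add_instruction helper are illustrative and not part of mteb.

from datasets import Dataset


def add_instruction(row: dict) -> dict:
    # Illustrative stand-in for mteb's _add_instruction_to_query helper.
    row["instruction"] = f"Find passages that answer: {row['text']}"
    return row


if __name__ == "__main__":
    queries = Dataset.from_dict(
        {"id": ["q1", "q2"], "text": ["what is a qrel?", "what is a corpus?"]}
    )
    # Dataset.map accepts num_proc, so row-wise preprocessing can run in parallel,
    # mirroring queries_dataset.map(_add_instruction_to_query, num_proc=num_proc).
    queries = queries.map(add_instruction, num_proc=2)
    # datasets.load_dataset also takes num_proc; that is what the new
    # _load_dataset_split(config, num_proc) forwards it to in the diff above.
    print(queries[0])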
mteb/abstasks/sts.py
CHANGED
@@ -1,19 +1,14 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import Any, TypedDict, cast
+from typing import TYPE_CHECKING, Any, TypedDict, cast
 
-from datasets import Dataset
 from scipy.stats import pearsonr, spearmanr
 
 from mteb._evaluators import AnySTSEvaluator
-from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
 from mteb.models import EncoderProtocol
-from mteb.types import PromptType
 from mteb.types.statistics import (
-    ImageStatistics,
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from ._statistics_calculation import (
@@ -23,6 +18,20 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, PromptType
+    from mteb.types.statistics import (
+        ImageStatistics,
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
@@ -103,14 +112,18 @@ class AbsTaskSTS(AbsTask):
 
     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         hf_split: str,
         hf_subset: str,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> STSMetrics:
+        if not isinstance(model, EncoderProtocol):
+            raise TypeError("Expected model to be an instance of EncoderProtocol")
+
         normalized_scores = list(map(self._normalize, data_split["score"]))
         data_split = data_split.select_columns(list(self.column_names))
 
@@ -124,7 +137,11 @@ class AbsTaskSTS(AbsTask):
             input2_prompt_type=self.input2_prompt_type,
             **kwargs,
         )
-        scores = evaluator(
+        scores = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )
 
         if prediction_folder:
             self._save_task_predictions(
@@ -142,7 +159,7 @@ class AbsTaskSTS(AbsTask):
     ) -> STSMetrics:
         def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
             """Return (pearson, spearman) correlations between x and y."""
-            return pearsonr(x, y)[0], spearmanr(x, y)[0]
+            return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])
 
         cosine_pearson, cosine_spearman = compute_corr(
             normalized_scores, scores["cosine_scores"]
@@ -179,7 +196,7 @@ class AbsTaskSTS(AbsTask):
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> AnySTSDescriptiveStatistics:
         first_column, second_column = self.column_names
-        self.dataset = cast(dict[str, dict[str, Dataset]], self.dataset)
+        self.dataset = cast("dict[str, dict[str, Dataset]]", self.dataset)
 
         if hf_subset:
             sentence1 = self.dataset[hf_subset][split][first_column]
@@ -233,9 +250,11 @@ class AbsTaskSTS(AbsTask):
             label_statistics=labels_statistics,
         )
 
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
-            repo_name,
+            repo_name,
+            [self.column_names[0], self.column_names[1], "score"],
+            num_proc=num_proc,
         )
 
     def _normalize(self, x: float) -> float:
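Most of the churn above is the same typing cleanup applied across this release: from __future__ import annotations plus a TYPE_CHECKING block, so type-only imports (Path, Dataset, EncodeKwargs, ...) no longer run at import time, and cast() switches to its string form for the same reason. A small self-contained sketch of the pattern follows; the save_predictions function is illustrative, not mteb's API.

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers; never imported at runtime.
    from pathlib import Path


def save_predictions(folder: Path | None = None) -> str:
    # With postponed evaluation the annotation stays a string, so the
    # absence of a runtime import of Path is never a problem.
    return "no prediction folder given" if folder is None else str(folder)


print(save_predictions())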
mteb/abstasks/task_metadata.py
CHANGED
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 import json
 import logging
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from huggingface_hub import (
     DatasetCard,
@@ -16,13 +18,11 @@ from pydantic import (
     ConfigDict,
     field_validator,
 )
-from typing_extensions import Required, TypedDict
+from typing_extensions import Required, TypedDict  # noqa: TC002
 
 import mteb
 from mteb.languages import check_language_code
 from mteb.types import (
-    HFSubset,
-    ISOLanguageScript,
     Languages,
     Licenses,
     Modalities,
@@ -30,7 +30,17 @@ from mteb.types import (
     StrDate,
     StrURL,
 )
-
+
+if TYPE_CHECKING:
+    from huggingface_hub import (
+        CardData,
+    )
+
+    from mteb.types import (
+        HFSubset,
+        ISOLanguageScript,
+    )
+    from mteb.types.statistics import DescriptiveStatistics
 
 logger = logging.getLogger(__name__)
 
@@ -150,7 +160,7 @@ _TASK_TYPE = (
     "InstructionReranking",
 ) + MIEB_TASK_TYPE
 
-TaskType = Literal[_TASK_TYPE]
+TaskType = Literal[_TASK_TYPE]  # type: ignore[valid-type]
 """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""
 
 
@@ -192,8 +202,10 @@ AnnotatorType = Literal[
 """The type of the annotators. Is often important for understanding the quality of a dataset."""
 
 
-PromptDict = TypedDict(
-    "PromptDict",
+PromptDict = TypedDict(  # type: ignore[misc]
+    "PromptDict",
+    {prompt_type.value: str for prompt_type in PromptType},
+    total=False,
 )
 """A dictionary containing the prompt used for the task.
 
@@ -365,7 +377,7 @@ class TaskMetadata(BaseModel):
         """Return a dictionary mapping huggingface subsets to languages."""
         if isinstance(self.eval_langs, dict):
             return self.eval_langs
-        return {"default": self.eval_langs}
+        return {"default": cast("list[str]", self.eval_langs)}
 
     @property
     def intext_citation(self, include_cite: bool = True) -> str:
@@ -376,9 +388,8 @@ class TaskMetadata(BaseModel):
         if include_cite and cite:
             # check for whitespace in the citation
             if " " in cite:
-
-
-                )
+                msg = "Citation contains whitespace. Please ensure that the citation is correctly formatted."
+                logger.warning(msg)
             return f"\\cite{{{cite}}}"
         return cite
 
@@ -414,7 +425,7 @@ class TaskMetadata(BaseModel):
         for subset, subset_value in stats.items():
             if subset == "hf_subset_descriptive_stats":
                 continue
-            n_samples[subset] = subset_value["num_samples"]
+            n_samples[subset] = subset_value["num_samples"]
         return n_samples
 
     @property
@@ -447,7 +458,7 @@ class TaskMetadata(BaseModel):
         Raises:
             ValueError: If the prompt type is not recognized.
         """
-        if prompt_type is None:
+        if prompt_type is None or self.category is None:
             return self.modalities
         query_modalities, doc_modalities = self.category.split("2")
         category_to_modality: dict[str, Modalities] = {
@@ -467,7 +478,7 @@ class TaskMetadata(BaseModel):
 
     def _create_dataset_card_data(
         self,
-        existing_dataset_card_data:
+        existing_dataset_card_data: CardData | None = None,
     ) -> tuple[DatasetCardData, dict[str, Any]]:
         """Create a DatasetCardData object from the task metadata.
 
@@ -483,7 +494,6 @@ class TaskMetadata(BaseModel):
         dataset_type = [
             *self._hf_task_type(),
             *self._hf_task_category(),
-            *self._hf_subtypes(),
         ]
         languages = self._hf_languages()
 
@@ -502,12 +512,13 @@ class TaskMetadata(BaseModel):
 
         tags = ["mteb"] + self.modalities
 
-        descriptive_stats =
-        if descriptive_stats is not None:
-
+        descriptive_stats = ""
+        if self.descriptive_stats is not None:
+            descriptive_stats_ = self.descriptive_stats
+            for split, split_stat in descriptive_stats_.items():
                 if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
                     split_stat.pop("hf_subset_descriptive_stats", {})
-            descriptive_stats = json.dumps(
+            descriptive_stats = json.dumps(descriptive_stats_, indent=4)
 
         dataset_card_data_params = existing_dataset_card_data.to_dict()
         # override the existing values
@@ -584,10 +595,8 @@ class TaskMetadata(BaseModel):
 
     def _hf_subtypes(self) -> list[str]:
         # to get full list of available task_ids execute
-        #
-        #
-        #     "repoType": "dataset"
-        # })
+        # https://huggingface.co/api/datasets-tags-by-type?type=task_ids
+        # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
         mteb_to_hf_subtype = {
             "Article retrieval": ["document-retrieval"],
             "Conversational retrieval": ["conversational", "utterance-retrieval"],
@@ -609,7 +618,7 @@ class TaskMetadata(BaseModel):
                 "hate-speech-detection",
             ],
             "Thematic clustering": [],
-            "Scientific Reranking": [],
+            "Scientific Reranking": ["text-scoring"],
             "Claim verification": ["fact-checking", "fact-checking-retrieval"],
             "Topic classification": ["topic-classification"],
             "Code retrieval": [],
@@ -617,21 +626,21 @@
             "Cross-Lingual Semantic Discrimination": [],
             "Textual Entailment": ["natural-language-inference"],
             "Counterfactual Detection": [],
-            "Emotion classification": [],
+            "Emotion classification": ["sentiment-classification"],
             "Reasoning as Retrieval": [],
             "Rendered Texts Understanding": [],
             "Image Text Retrieval": [],
             "Object recognition": [],
             "Scene recognition": [],
             "Caption Pairing": ["image-captioning"],
-            "Emotion recognition": [],
+            "Emotion recognition": ["sentiment-scoring"],
             "Textures recognition": [],
             "Activity recognition": [],
             "Tumor detection": [],
             "Duplicate Detection": [],
             "Rendered semantic textual similarity": [
                 "semantic-similarity-scoring",
-                "
+                "semantic-similarity-classification",
             ],
             "Intent classification": [
                 "intent-classification",
@@ -645,10 +654,8 @@ class TaskMetadata(BaseModel):
 
     def _hf_task_type(self) -> list[str]:
         # to get full list of task_types execute:
-        #
-        #
-        # }).json()
-        # or look at https://huggingface.co/tasks
+        # https://huggingface.co/api/datasets-tags-by-type?type=task_categories
+        # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
         mteb_task_type_to_datasets = {
             # Text
             "BitextMining": ["translation"],
@@ -667,7 +674,7 @@ class TaskMetadata(BaseModel):
             "Any2AnyRetrieval": ["visual-document-retrieval"],
             "Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
             "VisionCentricQA": ["visual-question-answering"],
-            "ImageClustering": ["image-
+            "ImageClustering": ["image-feature-extraction"],
             "ImageClassification": ["image-classification"],
             "ImageMultilabelClassification": ["image-classification"],
             "DocumentUnderstanding": ["visual-document-retrieval"],
@@ -695,11 +702,11 @@ class TaskMetadata(BaseModel):
 
     def _hf_languages(self) -> list[str]:
         languages: list[str] = []
-        if self.is_multilingual:
-            for val in
+        if self.is_multilingual and isinstance(self.eval_langs, dict):
+            for val in self.eval_langs.values():
                 languages.extend(val)
         else:
-            languages = self.eval_langs
+            languages = cast("list[str]", self.eval_langs)
         # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
         # or a special value like "code", "multilingual".
         readme_langs = []
@@ -711,7 +718,7 @@ class TaskMetadata(BaseModel):
             readme_langs.append(lang_name)
         return sorted(set(readme_langs))
 
-    def _hf_license(self) -> str:
+    def _hf_license(self) -> str | None:
         dataset_license = self.license
         if dataset_license:
             license_mapping = {