PyPI - mteb - Versions diffs - 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl - Mend

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (156) hide show

mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json ADDED Viewed

@@ -0,0 +1,184 @@
+{
+    "train": {
+        "num_samples": 16500,
+        "number_of_characters": 118992,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+            "min_image_width": 447,
+            "average_image_width": 1401.1196666666667,
+            "max_image_width": 2743,
+            "min_image_height": 376,
+            "average_image_height": 1685.2892,
+            "max_image_height": 5257,
+            "unique_images": 14981
+        },
+        "queries_text_statistics": {
+            "total_text_length": 118992,
+            "min_text_length": 13,
+            "average_text_length": 79.328,
+            "max_text_length": 204,
+            "unique_texts": 1499
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 1499,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0,
+            "max_relevant_docs_per_query": 1,
+            "unique_relevant_docs": 1499
+        },
+        "top_ranked_statistics": null,
+        "hf_subset_descriptive_stats": {
+            "en": {
+                "num_samples": 3300,
+                "number_of_characters": 20947,
+                "documents_text_statistics": null,
+                "documents_image_statistics": {
+                    "min_image_width": 653,
+                    "average_image_width": 1388.4603333333334,
+                    "max_image_width": 2464,
+                    "min_image_height": 878,
+                    "average_image_height": 1691.6246666666666,
+                    "max_image_height": 3533,
+                    "unique_images": 2996
+                },
+                "queries_text_statistics": {
+                    "total_text_length": 20947,
+                    "min_text_length": 31,
+                    "average_text_length": 69.82333333333334,
+                    "max_text_length": 142,
+                    "unique_texts": 300
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 300,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 300
+                },
+                "top_ranked_statistics": null
+            },
+            "es": {
+                "num_samples": 3300,
+                "number_of_characters": 24935,
+                "documents_text_statistics": null,
+                "documents_image_statistics": {
+                    "min_image_width": 447,
+                    "average_image_width": 1370.8263333333334,
+                    "max_image_width": 2743,
+                    "min_image_height": 376,
+                    "average_image_height": 1709.195,
+                    "max_image_height": 5257,
+                    "unique_images": 2997
+                },
+                "queries_text_statistics": {
+                    "total_text_length": 24935,
+                    "min_text_length": 35,
+                    "average_text_length": 83.11666666666666,
+                    "max_text_length": 153,
+                    "unique_texts": 300
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 300,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 300
+                },
+                "top_ranked_statistics": null
+            },
+            "fr": {
+                "num_samples": 3300,
+                "number_of_characters": 25217,
+                "documents_text_statistics": null,
+                "documents_image_statistics": {
+                    "min_image_width": 780,
+                    "average_image_width": 1402.3566666666666,
+                    "max_image_width": 2579,
+                    "min_image_height": 756,
+                    "average_image_height": 1689.5696666666668,
+                    "max_image_height": 2912,
+                    "unique_images": 2998
+                },
+                "queries_text_statistics": {
+                    "total_text_length": 25217,
+                    "min_text_length": 37,
+                    "average_text_length": 84.05666666666667,
+                    "max_text_length": 152,
+                    "unique_texts": 299
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 299,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 299
+                },
+                "top_ranked_statistics": null
+            },
+            "de": {
+                "num_samples": 3300,
+                "number_of_characters": 23029,
+                "documents_text_statistics": null,
+                "documents_image_statistics": {
+                    "min_image_width": 828,
+                    "average_image_width": 1394.5596666666668,
+                    "max_image_width": 2366,
+                    "min_image_height": 756,
+                    "average_image_height": 1686.0596666666668,
+                    "max_image_height": 2827,
+                    "unique_images": 2994
+                },
+                "queries_text_statistics": {
+                    "total_text_length": 23029,
+                    "min_text_length": 35,
+                    "average_text_length": 76.76333333333334,
+                    "max_text_length": 143,
+                    "unique_texts": 300
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 300,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 300
+                },
+                "top_ranked_statistics": null
+            },
+            "it": {
+                "num_samples": 3300,
+                "number_of_characters": 24864,
+                "documents_text_statistics": null,
+                "documents_image_statistics": {
+                    "min_image_width": 788,
+                    "average_image_width": 1449.3953333333334,
+                    "max_image_width": 2583,
+                    "min_image_height": 804,
+                    "average_image_height": 1649.997,
+                    "max_image_height": 2168,
+                    "unique_images": 2996
+                },
+                "queries_text_statistics": {
+                    "total_text_length": 24864,
+                    "min_text_length": 13,
+                    "average_text_length": 82.88,
+                    "max_text_length": 204,
+                    "unique_texts": 300
+                },
+                "queries_image_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 300,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 1.0,
+                    "max_relevant_docs_per_query": 1,
+                    "unique_relevant_docs": 300
+                },
+                "top_ranked_statistics": null
+            }
+        }
+    }
+}

mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 22637,
+        "number_of_characters": 21218611,
+        "documents_text_statistics": {
+            "total_text_length": 21197901,
+            "min_text_length": 7,
+            "average_text_length": 945.7015837608744,
+            "max_text_length": 37834,
+            "unique_texts": 22415
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 20710,
+            "min_text_length": 22,
+            "average_text_length": 93.28828828828829,
+            "max_text_length": 250,
+            "unique_texts": 222
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 1059,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 4.77027027027027,
+            "max_relevant_docs_per_query": 57,
+            "unique_relevant_docs": 491
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/STS/SICK-NL-STS.json ADDED Viewed

@@ -0,0 +1,28 @@
+{
+    "test": {
+        "num_samples": 4902,
+        "number_of_characters": 463327,
+        "unique_pairs": 4902,
+        "text1_statistics": {
+            "total_text_length": 233941,
+            "min_text_length": 10,
+            "average_text_length": 47.72358221134231,
+            "max_text_length": 158,
+            "unique_texts": 3378
+        },
+        "text2_statistics": {
+            "total_text_length": 229386,
+            "min_text_length": 10,
+            "average_text_length": 46.79436964504284,
+            "max_text_length": 158,
+            "unique_texts": 3327
+        },
+        "image1_statistics": null,
+        "image2_statistics": null,
+        "label_statistics": {
+            "min_score": 1.0,
+            "avg_score": 3.528012039368932,
+            "max_score": 5.0
+        }
+    }
+}

mteb/languages/check_language_code.py CHANGED Viewed

@@ -13,7 +13,15 @@ def check_language_code(code: str) -> None:
     Args:
         code: The language code to check.
     """
-    lang, script = code.split("-")
+    lang = None
+    script = None
+    if "-" in code:
+        lang, script = code.split("-")
+    elif code[0].isupper():
+        script = code
+    else:
+        lang = code
     if script == "Code":
         if lang in PROGRAMMING_LANGS:
             return  # override for code
@@ -21,11 +29,11 @@ def check_language_code(code: str) -> None:
             raise ValueError(
                 f"Programming language {lang} is not a valid programming language."
             )
-    if lang not in ISO_TO_LANGUAGE:
+    if lang is not None and lang not in ISO_TO_LANGUAGE:
         raise ValueError(
             f"Invalid language code: {lang}, you can find valid ISO 639-3 codes in {path_to_lang_codes}"
         )
-    if script not in ISO_TO_SCRIPT:
+    if script is not None and script not in ISO_TO_SCRIPT:
         raise ValueError(
             f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}"
         )

mteb/languages/language_scripts.py CHANGED Viewed

@@ -3,6 +3,8 @@ from dataclasses import dataclass
 from typing_extensions import Self
+from mteb.languages import check_language_code
 @dataclass
 class LanguageScripts:
@@ -46,8 +48,10 @@ class LanguageScripts:
                 if len(lang_script) == 2:
                     normalized_langs.add(lang_script[0])
                     lang_script_codes.add(lang)
+                    check_language_code(lang)
                     script_codes.add(lang_script[1])
                 else:
+                    check_language_code(lang)
                     normalized_langs.add(lang)
         return cls(

mteb/leaderboard/text_segments.py CHANGED Viewed

@@ -53,7 +53,7 @@ ACKNOWLEDGEMENT = """
         <img src="https://play-lh.googleusercontent.com/HdfHZ5jnfMM1Ep7XpPaVdFIVSRx82wKlRC_qmnHx9H1E4aWNp4WKoOcH0x95NAnuYg" width="60" height="55" style="padding: 10px;">
     </a>
     <a href="https://huggingface.co">
-        <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/hf_logo.png" width="60" height="55" style="padding: 10px;">
+        <img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/logos/hf_logo.png" width="60" height="55" style="padding: 10px;">
     </a>
 </div>

mteb/models/model_implementations/b1ade_models.py CHANGED Viewed

@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 b1ade_training_data = {
-    # We are in teh process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
+    # We are in the process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
     # Similar to mixedbread models, we do not train on any data (except the MSMarco training split) of MTEB.
     "MSMARCO",
 }

mteb/models/model_implementations/bge_models.py CHANGED Viewed

@@ -62,7 +62,7 @@ bge_m3_training_data = {
     # mMARCO-ZH
     # LawGPT
     # NLI-zh2, LeCaRDv2,
-    # NLI, MultiLongDoc (their syntetic)
+    # NLI, MultiLongDoc (their synthetic)
     # + synthetic data
 }
@@ -141,7 +141,6 @@ bge_chinese_training_data = {
 # https://huggingface.co/BAAI/bge-m3/discussions/29
 bgem3_languages = [
     "afr-Latn",  # af
-    # als
     "amh-Ethi",  # am
     # an
     # ar
@@ -151,7 +150,6 @@ bgem3_languages = [
     # av
     # az
     "azj-Latn",  # azb
-    # ba
     # bar
     # bcl
     "ben-Beng",  # be

mteb/models/model_implementations/bmretriever_models.py CHANGED Viewed

@@ -48,7 +48,7 @@ class BMRetrieverWrapper(InstructSentenceTransformerModel):
         if padding_side is not None:
             tokenizer_params["padding_side"] = padding_side
         kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params)
-        kwargs.setdefault("config_args", {}).update(revison=revision)
+        kwargs.setdefault("config_args", {}).update(revision=revision)
         transformer = Transformer(
             model_name,

mteb/models/model_implementations/gme_v_models.py CHANGED Viewed

@@ -39,7 +39,7 @@ class Encoder(torch.nn.Module):
         self.max_length = max_length
         self.normalize = normalize
         self.processor.tokenizer.padding_side = "right"
-        self.defualt_instruction = "You are a helpful assistant."
+        self.default_instruction = "You are a helpful assistant."
     def forward(
         self,
@@ -103,7 +103,7 @@ class Encoder(torch.nn.Module):
         instruction=None,
         **kwargs,
     ):
-        instruction = instruction or self.defualt_instruction
+        instruction = instruction or self.default_instruction
         # Inputs must be batched
         input_texts, input_images = [], []
         for t, i in zip(texts, images):

mteb/models/model_implementations/ibm_granite_models.py CHANGED Viewed

@@ -79,7 +79,7 @@ granite_training_data = {
     "MIRACLReranking",
     # Multilingual MrTydi Triples
     "MrTidyRetrieval",
-    # Sadeeem Question Asnwering
+    # Sadeeem Question Answering
     # DBPedia Title-Body Pairs
     "DBPedia",
     "DBPedia-NL",  # translated from hotpotQA (not trained on)

mteb/models/model_implementations/inf_models.py CHANGED Viewed

@@ -4,7 +4,7 @@ from mteb.models.model_meta import (
 )
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
-inf_retreiver_v1_training_data = {
+inf_retriever_v1_training_data = {
     # eng_Latn
     "ArguAna",
     "CQADupstackRetrieval",
@@ -66,7 +66,7 @@ inf_retriever_v1 = ModelMeta(
     adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct",
     public_training_code=None,
     public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
     citation=INF_RETRIEVER_CITATION,
 )
@@ -92,6 +92,6 @@ inf_retriever_v1_1_5b = ModelMeta(
     adapted_from="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
     public_training_code=None,
     public_training_data=None,
-    training_datasets=inf_retreiver_v1_training_data,
+    training_datasets=inf_retriever_v1_training_data,
     citation=INF_RETRIEVER_CITATION,
 )

mteb/models/model_implementations/jina_models.py CHANGED Viewed

@@ -310,9 +310,19 @@ class JinaV4Wrapper(AbsEncoder):
         text_embeddings = None
         image_embeddings = None
         if "text" in inputs.dataset.features:
-            text_embeddings = self.get_text_embeddings(inputs, **kwargs)
+            text_embeddings = self.get_text_embeddings(
+                inputs,
+                task_metadata=task_metadata,
+                prompt_type=prompt_type,
+                **kwargs,
+            )
         if "image" in inputs.dataset.features:
-            image_embeddings = self.get_image_embeddings(inputs, **kwargs)
+            image_embeddings = self.get_image_embeddings(
+                inputs,
+                task_metadata=task_metadata,
+                prompt_type=prompt_type,
+                **kwargs,
+            )
         if text_embeddings is not None and image_embeddings is not None:
             if len(text_embeddings) != len(image_embeddings):

mteb/models/model_implementations/llm2vec_models.py CHANGED Viewed

@@ -23,7 +23,7 @@ def llm2vec_instruction(instruction):
 llm2vec_supervised_training_data = {
     # source, section g1: https://arxiv.org/pdf/2404.05961
-    # splits assumed but unkown
+    # splits assumed but unknown
     "HotpotQA",
     "HotpotQA-PL",  # translation not trained on
     "HotpotQA-NL",  # translation not trained on

mteb/models/model_implementations/misc_models.py CHANGED Viewed

@@ -382,7 +382,7 @@ Mihaiii__Venusaur = ModelMeta(
     reference="https://huggingface.co/Mihaiii/Venusaur",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=None,
-    training_datasets=None,  # source model is unkown
+    training_datasets=None,  # source model is unknown
     # {"Mihaiii/qa-assistant"},
     adapted_from="Mihaiii/test14",
     superseded_by=None,
@@ -1516,7 +1516,7 @@ openbmb__minicpm_embedding = ModelMeta(
     superseded_by=None,
 )
-silma_ai__silma_embeddding_matryoshka_v0_1 = ModelMeta(
+silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
     name="silma-ai/silma-embeddding-matryoshka-v0.1",
     revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea",
     release_date="2024-10-12",

mteb/models/model_implementations/mxbai_models.py CHANGED Viewed

@@ -5,7 +5,7 @@ from mteb.models.model_meta import (
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 mixedbread_training_data = {
-    # from correspondance:
+    # from correspondence:
     # as mentioned in our blog post
     # (https://www.mixedbread.com/blog/mxbai-embed-large-v1#built-for-rag-and-real-world-use-cases:~:text=During%20the%20whole,related%20use%20cases.)
     # We do not train on any data (except the MSMarco training split) of MTEB. We have a strong filtering process to ensure the OOD setting. That's true

mteb/models/model_implementations/salesforce_models.py CHANGED Viewed

@@ -27,7 +27,7 @@ SFR_TRAINING_DATA = {  # inherits from e5
     "HotpotQA-PL",  # translation not trained on
     "HotpotQA-NL",  # translation not trained on
     # source: https://github.com/embeddings-benchmark/leaderboard/issues/41
-    # qoute: In the realm of Semantic Textual Similarity (STS), it is trained on STS12, STS22, and STSBenchmark
+    # quote: In the realm of Semantic Textual Similarity (STS), it is trained on STS12, STS22, and STSBenchmark
     "STS12",
     "STS22",
     "STSBenchmark",

mteb/models/model_implementations/seed_1_6_embedding_models.py CHANGED Viewed

@@ -344,7 +344,7 @@ TASK_NAME_TO_INSTRUCTION = {
     "SprintDuplicateQuestions": "Retrieve semantically similar text\n{}",
     "TwitterSemEval2015": "Retrieve semantically similar text\n{}",
     "TwitterURLCorpus": "Retrieve semantically similar text\n{}",
-    "CQADupstackGamingRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given questionn\n{}",
+    "CQADupstackGamingRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question\n{}",
     "CQADupstackUnixRetrieval": "Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question\n{}",
     "DuRetrieval": "为这个句子生成表示以用于检索相关内容：{}",
     "T2Retrieval": "为这个句子生成表示以用于检索相关内容：{}",

mteb/models/model_implementations/voyage_v.py CHANGED Viewed

@@ -51,7 +51,13 @@ def _downsample_image(
 def voyage_v_loader(model_name, **kwargs):
     requires_package(
         voyage_v_loader,
-        "voyageai and tenacity",
+        "voyageai",
+        model_name,
+        "pip install 'mteb[voyage_v]'",
+    )
+    requires_package(
+        voyage_v_loader,
+        "tenacity",
         model_name,
         "pip install 'mteb[voyage_v]'",
     )
@@ -65,11 +71,9 @@ def voyage_v_loader(model_name, **kwargs):
             **kwargs: Any,
         ):
             requires_image_dependencies()
-            from torchvision import transforms
             self.model_name = model_name.split("/")[-1]
             self.vo = voyageai.Client()
-            self.tensor_to_image = transforms.Compose([transforms.PILToTensor()])
         @retry(
             stop=stop_after_attempt(6),  # Stop after 6 attempts
@@ -126,10 +130,7 @@ def voyage_v_loader(model_name, **kwargs):
             for batch in tqdm(
                 images, disable=not show_progress_bar, desc="Image Encoding"
             ):
-                batch_images = [
-                    [_downsample_image(self.tensor_to_image(image))]
-                    for image in batch["image"]
-                ]
+                batch_images = [[_downsample_image(image)] for image in batch["image"]]
                 embeddings = self._multimodal_embed(
                     batch_images, model=self.model_name, input_type=input_type
                 ).embeddings
@@ -163,8 +164,7 @@ def voyage_v_loader(model_name, **kwargs):
                     inputs, disable=not show_progress_bar, desc="Interleaved Encoding"
                 ):
                     batch_images = [
-                        _downsample_image(self.tensor_to_image(image))
-                        for image in batch["image"]
+                        _downsample_image(image) for image in batch["image"]
                     ]
                     batch_texts = batch["text"]
                     interleaved_inputs = [

mteb/results/task_result.py CHANGED Viewed

@@ -32,7 +32,7 @@ from mteb.types import (
 logger = logging.getLogger(__name__)
-class Criterias(HelpfulStrEnum):
+class Criteria(HelpfulStrEnum):
     """Enum for criteria to check when merging TaskResult objects."""
     MTEB_VERSION = "mteb_version"
@@ -671,7 +671,7 @@ class TaskResult(BaseModel):
     def is_mergeable(
         self,
         result: TaskResult | AbsTask,
-        criteria: list[str] | list[Criterias] = [
+        criteria: list[str] | list[Criteria] = [
             "mteb_version",
             "dataset_revision",
         ],
@@ -688,9 +688,7 @@ class TaskResult(BaseModel):
         Returns:
             True if the TaskResult object can be merged with the other object, False otherwise.
         """
-        criteria = [
-            Criterias.from_str(c) if isinstance(c, str) else c for c in criteria
-        ]
+        criteria = [Criteria.from_str(c) if isinstance(c, str) else c for c in criteria]
         if isinstance(result, TaskResult):
             name = result.task_name
             revision = result.dataset_revision
@@ -709,14 +707,14 @@ class TaskResult(BaseModel):
                 )
             return False
-        if Criterias.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
+        if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
             if raise_error:
                 raise ValueError(
                     f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
                 )
             return False
-        if Criterias.DATASET_REVISION in criteria and self.dataset_revision != revision:
+        if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
             if raise_error:
                 raise ValueError(
                     f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
@@ -728,7 +726,7 @@ class TaskResult(BaseModel):
     def merge(
         self,
         new_results: TaskResult,
-        criteria: list[str] | list[Criterias] = [
+        criteria: list[str] | list[Criteria] = [
             "mteb_version",
             "dataset_revision",
         ],

mteb/tasks/classification/dan/angry_tweets_classification.py CHANGED Viewed

@@ -9,7 +9,7 @@ class AngryTweetsClassification(AbsTaskClassification):
             "path": "DDSC/angry-tweets",
             "revision": "20b0e6081892e78179356fada741b7afa381443d",
         },
-        description="A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
+        description="A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets",
         reference="https://aclanthology.org/2021.nodalida-main.53/",
         type="Classification",
         category="t2c",
@@ -47,7 +47,7 @@ class AngryTweetsClassificationV2(AbsTaskClassification):
             "path": "mteb/angry_tweets",
             "revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77",
         },
-        description="""A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets
+        description="""A sentiment dataset with 3 classes (positive, negative, neutral) for Danish tweets
         This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
         reference="https://aclanthology.org/2021.nodalida-main.53/",
         type="Classification",

mteb/tasks/classification/eng/legal_bench_classification.py CHANGED Viewed

@@ -2641,7 +2641,7 @@ class InternationalCitizenshipQuestionsLegalBenchClassification(AbsTaskClassific
 class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
     metadata = TaskMetadata(
         name="JCrewBlockerLegalBenchClassification",
-        description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.",
+        description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.",
         reference="https://huggingface.co/datasets/nguha/legalbench",
         dataset={
             "path": "mteb/JCrewBlockerLegalBenchClassification",
@@ -2677,7 +2677,7 @@ class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
 class JCrewBlockerLegalBenchClassificationV2(AbsTaskClassification):
     metadata = TaskMetadata(
         name="JCrewBlockerLegalBenchClassification.v2",
-        description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.
+        description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.
         This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
         reference="https://huggingface.co/datasets/nguha/legalbench",
         dataset={
@@ -4500,7 +4500,7 @@ class OverrulingLegalBenchClassificationV2(AbsTaskClassification):
 class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification):
     metadata = TaskMetadata(
         name="PersonalJurisdictionLegalBenchClassification",
-        description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could excercise personal jurisdiction over the defendant.""",
+        description="""Given a fact pattern describing the set of contacts between a plaintiff, defendant, and forum, determine if a court in that forum could exercise personal jurisdiction over the defendant.""",
         reference="https://huggingface.co/datasets/nguha/legalbench",
         dataset={
             "path": "mteb/PersonalJurisdictionLegalBenchClassification",

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl