PyPI - mteb - Versions diffs - 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl - Mend

mteb 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

mteb/models/model_meta.py CHANGED Viewed

@@ -17,6 +17,7 @@ from huggingface_hub import (
     get_safetensors_metadata,
     hf_hub_download,
     list_repo_commits,
+    model_info,
     repo_exists,
 )
 from huggingface_hub.errors import (
@@ -56,6 +57,10 @@ FRAMEWORKS = Literal[
     "PyLate",
     "ColBERT",
     "ColPali",
+    "GGUF",
+    "safetensors",
+    "ONNX",
+    "Transformers",
 ]
 MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
@@ -82,9 +87,6 @@ def _get_loader_name(
     return loader.__name__
-_SENTENCE_TRANSFORMER_LIB_NAME: FRAMEWORKS = "Sentence Transformers"
 class ModelMeta(BaseModel):
     """The model metadata object.
@@ -319,14 +321,10 @@ class ModelMeta(BaseModel):
                 model_config = None
                 logger.warning(f"Can't get configuration for {model_name}. Error: {e}")
-            if card_data.library_name == _SENTENCE_TRANSFORMER_LIB_NAME or (
-                card_data.tags and _SENTENCE_TRANSFORMER_LIB_NAME in card_data.tags
-            ):
-                frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
-            else:
-                msg = "Model library not recognized, defaulting to Sentence Transformers loader."
-                logger.warning(msg)
-                warnings.warn(msg)
+            hf_frameworks = (
+                cls._get_frameworks_from_hf_tags(model_name) if model_name else []
+            )
+            frameworks.extend(hf_frameworks)
             if revision is None:
                 revisions = _get_repo_commits(model_name, "model")
@@ -386,8 +384,6 @@ class ModelMeta(BaseModel):
             else model.model_card_data.base_model
         )
         meta = cls._from_hub(name, revision, compute_metadata)
-        if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
-            meta.framework.append("Sentence Transformers")
         meta.revision = model.model_card_data.base_model_revision or meta.revision
         meta.max_tokens = model.max_seq_length
         meta.embed_dim = model.get_sentence_embedding_dimension()
@@ -413,8 +409,6 @@ class ModelMeta(BaseModel):
             The generated ModelMeta.
         """
         meta = cls._from_hub(model, revision, compute_metadata)
-        if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
-            meta.framework.append("Sentence Transformers")
         meta.modalities = ["text"]
         if model and compute_metadata and _repo_exists(model):
@@ -461,8 +455,6 @@ class ModelMeta(BaseModel):
         from mteb.models import CrossEncoderWrapper
         meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
-        if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
-            meta.framework.append("Sentence Transformers")
         meta.revision = model.config._commit_hash or meta.revision
         meta.loader = CrossEncoderWrapper
         meta.embed_dim = None
@@ -644,6 +636,43 @@ class ModelMeta(BaseModel):
             return release_date
         return None
+    @staticmethod
+    def _get_frameworks_from_hf_tags(model_name: str) -> list[FRAMEWORKS]:
+        """Extract frameworks supported by the model from HuggingFace model tags.
+        Args:
+            model_name: HuggingFace model name
+        Returns:
+            List of framework names found in tags. Defaults to empty list if no frameworks found.
+        """
+        try:
+            info = model_info(model_name)
+            if not info.tags:
+                return []
+        except Exception as e:
+            logger.warning(
+                f"Failed to fetch frameworks from HuggingFace tags for {model_name}: {e}"
+            )
+            return []
+        # Mapping from HuggingFace tags to MTEB framework names
+        tag_to_framework: dict[str, FRAMEWORKS] = {
+            "sentence-transformers": "Sentence Transformers",
+            "transformers": "Transformers",
+            "onnx": "ONNX",
+            "safetensors": "safetensors",
+            "gguf": "GGUF",
+        }
+        frameworks: list[FRAMEWORKS] = []
+        for framework_tag in tag_to_framework.keys():
+            if framework_tag in info.tags:
+                frameworks.append(tag_to_framework[framework_tag])
+        return frameworks
     def to_python(self) -> str:
         """Returns a string representation of the model."""
         return _pydantic_instance_to_code(self)

mteb/results/benchmark_results.py CHANGED Viewed

@@ -432,11 +432,11 @@ class BenchmarkResults(BaseModel):
             out_file.write(self.model_dump_json(indent=2))
     @classmethod
-    def from_validated(cls, **data) -> BenchmarkResults:
+    def from_validated(cls, **data: Any) -> BenchmarkResults:
         """Create BenchmarkResults from validated data.
         Args:
-            data: Dictionary containing the data.
+            **data: Arbitrary keyword arguments containing the data.
         Returns:
             An instance of BenchmarkResults.

mteb/tasks/classification/kur/kurdish_sentiment_classification.py CHANGED Viewed

@@ -25,7 +25,7 @@ class KurdishSentimentClassification(AbsTaskClassification):
         dialect=["Sorani"],
         sample_creation="found",
         bibtex_citation=r"""
-@article{article,
+@article{badawi2024kurdisent,
   author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali},
   doi = {10.1007/s10579-023-09716-6},
   journal = {Language Resources and Evaluation},
@@ -62,7 +62,7 @@ class KurdishSentimentClassificationV2(AbsTaskClassification):
         dialect=["Sorani"],
         sample_creation="found",
         bibtex_citation=r"""
-@article{article,
+@article{badawi2024kurdisent,
   author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali},
   doi = {10.1007/s10579-023-09716-6},
   journal = {Language Resources and Evaluation},

mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py CHANGED Viewed

@@ -25,7 +25,7 @@ class HUMEWikiCitiesClustering(AbsTaskClusteringLegacy):
         dialect=[],
         sample_creation="found",
         bibtex_citation=r"""
-@online{wikidump,
+@online{wikidump2024,
   author = {Wikimedia Foundation},
   title = {Wikimedia Downloads},
   url = {https://dumps.wikimedia.org},

mteb/tasks/clustering/eng/wiki_cities_clustering.py CHANGED Viewed

@@ -25,7 +25,7 @@ class WikiCitiesClustering(AbsTaskClusteringLegacy):
         dialect=[],
         sample_creation="found",
         bibtex_citation=r"""
-@online{wikidump,
+@online{wikidump2024,
   author = {Wikimedia Foundation},
   title = {Wikimedia Downloads},
   url = {https://dumps.wikimedia.org},

mteb/tasks/clustering/zho/cmteb_clustering.py CHANGED Viewed

@@ -226,7 +226,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClustering):
         dialect=[],
         sample_creation="found",
         bibtex_citation=r"""
-@software{THUCTC,
+@software{sun2016thuctc,
   author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
   note = {THU Chinese Text Classification Toolkit},
   publisher = {THU Natural Language Processing Lab},
@@ -285,7 +285,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClustering):
         dialect=[],
         sample_creation="found",
         bibtex_citation=r"""
-@software{THUCTC,
+@software{sun2016thuctc,
   author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
   note = {THU Chinese Text Classification Toolkit},
   publisher = {THU Natural Language Processing Lab},

mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py CHANGED Viewed

@@ -44,7 +44,7 @@ class WikipediaRerankingMultilingual(AbsTaskRetrieval):
         dialect=[],
         sample_creation="LM-generated and verified",
         bibtex_citation=r"""
-@online{wikidump,
+@online{wikidump2024,
   author = {Wikimedia Foundation},
   title = {Wikimedia Downloads},
   url = {https://dumps.wikimedia.org},

mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py CHANGED Viewed

@@ -25,7 +25,7 @@ class CUB200I2I(AbsTaskRetrieval):
         modalities=["image"],
         sample_creation="created",
         bibtex_citation=r"""
-@article{article,
+@article{welinder2010caltech,
   author = {Welinder, Peter and Branson, Steve and Mita, Takeshi and Wah, Catherine and Schroff, Florian and Belongie, Serge and Perona, Pietro},
   month = {09},
   pages = {},

mteb/tasks/retrieval/vie/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from .argu_ana_vn_retrieval import ArguAnaVN
-from .climate_fevervn_retrieval import ClimateFEVERVN
+from .climate_fevervn_retrieval import ClimateFEVERVN, NanoClimateFEVERVN
 from .cqa_dupstack_android_vn_retrieval import CQADupstackAndroidVN
 from .cqa_dupstack_gis_vn_retrieval import CQADupstackGisVN
 from .cqa_dupstack_mathematica_vn_retrieval import CQADupstackMathematicaVN
@@ -10,19 +10,20 @@ from .cqa_dupstack_tex_vn_retrieval import CQADupstackTexVN
 from .cqa_dupstack_unix_vn_retrieval import CQADupstackUnixVN
 from .cqa_dupstack_webmasters_vn_retrieval import CQADupstackWebmastersVN
 from .cqa_dupstack_wordpress_vn_retrieval import CQADupstackWordpressVN
-from .db_pedia_vn_retrieval import DBPediaVN
-from .fevervn_retrieval import FEVERVN
+from .db_pedia_vn_retrieval import DBPediaVN, NanoDBPediaVN
+from .fevervn_retrieval import FEVERVN, NanoFEVERVN
 from .fi_qa2018_vn_retrieval import FiQA2018VN
 from .green_node_table_markdown_retrieval import GreenNodeTableMarkdownRetrieval
-from .hotpot_qavn_retrieval import HotpotQAVN
-from .msmarcovn_retrieval import MSMARCOVN
+from .hotpot_qavn_retrieval import HotpotQAVN, NanoHotpotQAVN
+from .msmarcovn_retrieval import MSMARCOVN, NanoMSMARCOVN
 from .nf_corpus_vn_retrieval import NFCorpusVN
-from .nqvn_retrieval import NQVN
+from .nqvn_retrieval import NQVN, NanoNQVN
 from .quora_vn_retrieval import QuoraVN
 from .sci_fact_vn_retrieval import SciFactVN
 from .scidocsvn_retrieval import SCIDOCSVN
 from .touche2020_vn_retrieval import Touche2020VN
 from .treccovidvn_retrieval import TRECCOVIDVN
+from .tvpl_retrieval import TVPLRetrieval
 from .vie_qu_ad_retrieval import VieQuADRetrieval
 from .zac_legal_text_retrieval import ZacLegalTextRetrieval
@@ -49,8 +50,15 @@ __all__ = [
     "GreenNodeTableMarkdownRetrieval",
     "HotpotQAVN",
     "NFCorpusVN",
+    "NanoClimateFEVERVN",
+    "NanoDBPediaVN",
+    "NanoFEVERVN",
+    "NanoHotpotQAVN",
+    "NanoMSMARCOVN",
+    "NanoNQVN",
     "QuoraVN",
     "SciFactVN",
+    "TVPLRetrieval",
     "Touche2020VN",
     "VieQuADRetrieval",
     "ZacLegalTextRetrieval",

mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class ClimateFEVERVN(AbsTaskRetrieval):
 """,
         adapted_from=["ClimateFEVER"],
     )
+class NanoClimateFEVERVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoClimateFEVER-VN",
+        description="NanoClimateFEVERVN is a small version of A translated dataset from CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
+        dataset={
+            "path": "GreenNode/nano-climate-fever-vn",
+            "revision": "1852e852f07403d4529a8520d52b91ff6d57869b",
+        },
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=["Claim verification"],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a claim about climate change, retrieve documents that support or refute the claim"
+        },
+        adapted_from=["ClimateFEVER-VN"],
+    )

mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class DBPediaVN(AbsTaskRetrieval):
 """,
         adapted_from=["DBPedia"],
     )
+class NanoDBPediaVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoDBPedia-VN",
+        description="NanoDBPediaVN is a small version of A translated dataset from DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://github.com/iai-group/DBpedia-Entity/",
+        dataset={
+            "path": "GreenNode/nano-dbpedia-vn",
+            "revision": "bbc3259bc63bf1e250d7034024092cc3230d5850",
+        },
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Written", "Encyclopaedic"],
+        task_subtypes=[],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a query, retrieve relevant entity descriptions from DBPedia"
+        },
+        adapted_from=["DBPedia-VN"],
+    )

mteb/tasks/retrieval/vie/fevervn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class FEVERVN(AbsTaskRetrieval):
 """,
         adapted_from=["FEVER"],
     )
+class NanoFEVERVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoFEVER-VN",
+        dataset={
+            "path": "GreenNode/nano-fever-vn",
+            "revision": "457ca6b058ed19b28f2359e2d816d7527af6bef8",
+        },
+        description="NanoFEVERVN is a small version of A translated dataset from FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://fever.ai/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=["Claim verification"],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a claim, retrieve documents that support or refute the claim"
+        },
+        adapted_from=["FEVER-VN"],
+    )

mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class HotpotQAVN(AbsTaskRetrieval):
 """,
         adapted_from=["HotpotQA"],
     )
+class NanoHotpotQAVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoHotpotQA-VN",
+        dataset={
+            "path": "GreenNode/nano-hotpotqa-vn",
+            "revision": "f4de19a2fae1a582de114e5bcd178bb262183113",
+        },
+        description="NanoHotpotQAVN is a small version of A translated dataset from HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://hotpotqa.github.io/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Web", "Written"],
+        task_subtypes=["Question answering"],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a multi-hop question, retrieve documents that can help answer the question"
+        },
+        adapted_from=["HotpotQA-VN"],
+    )

mteb/tasks/retrieval/vie/msmarcovn_retrieval.py CHANGED Viewed

@@ -47,3 +47,51 @@ class MSMARCOVN(AbsTaskRetrieval):
 """,
         adapted_from=["MSMARCO"],
     )
+class NanoMSMARCOVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoMSMARCO-VN",
+        dataset={
+            "path": "GreenNode/nano-msmarco-vn",
+            "revision": "f149369c82ec228b05b0f6677699ab4bfbab73f6",
+        },
+        description="NanoMSMARCOVN is a small version of A translated dataset from MS MARCO is a collection of datasets focused on deep learning in search The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://microsoft.github.io/msmarco/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["dev"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=[
+            "Encyclopaedic",
+            "Academic",
+            "Blog",
+            "News",
+            "Medical",
+            "Government",
+            "Reviews",
+            "Non-fiction",
+            "Social",
+            "Web",
+        ],
+        task_subtypes=["Question answering"],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={"query": "Given a query, retrieve relevant documents from MS MARCO-VN"},
+        adapted_from=["MSMARCO-VN"],
+    )

mteb/tasks/retrieval/vie/nqvn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class NQVN(AbsTaskRetrieval):
 """,
         adapted_from=["NQ"],
     )
+class NanoNQVN(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="NanoNQ-VN",
+        dataset={
+            "path": "GreenNode/nano-nq-vn",
+            "revision": "1ad4d6556fe0e5314994839089ce070fb0db8b19",
+        },
+        description="NanoNQVN is a small version of A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
+        reference="https://ai.google.com/research/NaturalQuestions/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Written", "Encyclopaedic"],
+        task_subtypes=["Question answering"],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a question, retrieve Wikipedia passages that answer the question"
+        },
+        adapted_from=["NQ-VN"],
+    )

mteb/tasks/retrieval/vie/tvpl_retrieval.py ADDED Viewed

@@ -0,0 +1,42 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+TEST_SAMPLES = 2048
+class TVPLRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="TVPLRetrieval",
+        description="A Vietnamese dataset for evaluating legal text retrieval. From Thu vien phap luat (TVPL) dataset: Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models.",
+        reference="https://aclanthology.org/2020.coling-main.233.pdf",
+        dataset={
+            "path": "GreenNode/TVPL-Retrieval-VN",
+            "revision": "6661dba4dfedff606537732d9f35f2c3738b081a",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        dialect=[],
+        annotations_creators="human-annotated",
+        domains=["Legal"],
+        task_subtypes=["Question answering"],
+        sample_creation="found",
+        bibtex_citation=r"""
+@article{10.1145/3732938,
+  address = {New York, NY, USA},
+  author = {Le, Huong and Luu, Ngoc and Nguyen, Thanh and Dao, Tuan and Dinh, Sang},
+  doi = {10.1145/3732938},
+  issn = {2375-4699},
+  journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+  publisher = {Association for Computing Machinery},
+  title = {Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models},
+  url = {https://doi.org/10.1145/3732938},
+  year = {2025},
+}
+""",
+    )

mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py CHANGED Viewed

@@ -24,5 +24,19 @@ class ZacLegalTextRetrieval(AbsTaskRetrieval):
         annotations_creators="human-annotated",
         dialect=[],
         sample_creation="found",
-        bibtex_citation="",  # TODO: Add bibtex citation when the paper is published
+        bibtex_citation=r"""
+@inproceedings{10.1007/978-981-95-1746-6_17,
+  address = {Singapore},
+  author = {Pham, Bao Loc
+and Hoang, Quoc Viet
+and Luu, Quy Tung
+and Vo, Trong Thu},
+  booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
+  isbn = {978-981-95-1746-6},
+  pages = {153--163},
+  publisher = {Springer Nature Singapore},
+  title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
+  year = {2026},
+}
+""",
     )

{mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.6.4
+Version: 2.6.6
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -32,8 +32,6 @@ Requires-Dist: rich>=0.0.0
 Requires-Dist: pytrec-eval-terrier>=0.5.6
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: polars>=0.20.22
-Requires-Dist: torch<2.9.0; python_full_version < "3.14"
-Requires-Dist: torch>=2.9.0; python_full_version >= "3.14"
 Provides-Extra: image
 Requires-Dist: torchvision>0.2.1; extra == "image"
 Requires-Dist: transformers[torch-vision,vision]; extra == "image"
@@ -97,6 +95,8 @@ Requires-Dist: colpali_engine>=0.3.12; python_full_version < "3.14" and extra ==
 Provides-Extra: colqwen3
 Requires-Dist: transformers>=4.57; extra == "colqwen3"
 Requires-Dist: torchvision>=0.22.1; extra == "colqwen3"
+Provides-Extra: sauerkrautlm-colpali
+Requires-Dist: sauerkrautlm-colpali>=0.1.0; python_full_version < "3.14" and extra == "sauerkrautlm-colpali"
 Provides-Extra: xet
 Requires-Dist: huggingface_hub>=0.32.0; extra == "xet"
 Provides-Extra: youtu

mteb 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl

mteb 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl