PyPI - mteb - Versions diffs - 2.6.5__py3-none-any.whl → 2.6.6__py3-none-any.whl - Mend

mteb 2.6.5-py3-none-any.whl → 2.6.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 102198,
+        "number_of_characters": 47870352,
+        "documents_text_statistics": {
+            "total_text_length": 47719757,
+            "min_text_length": 9,
+            "average_text_length": 472.01951591046225,
+            "max_text_length": 8686,
+            "unique_texts": 101097
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 150595,
+            "min_text_length": 30,
+            "average_text_length": 136.78019981834694,
+            "max_text_length": 404,
+            "unique_texts": 1099
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 3401,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 3.089009990917348,
+            "max_relevant_docs_per_query": 5,
+            "unique_relevant_docs": 1123
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 132137,
+        "number_of_characters": 43323279,
+        "documents_text_statistics": {
+            "total_text_length": 43311486,
+            "min_text_length": 11,
+            "average_text_length": 328.5778249819823,
+            "max_text_length": 8576,
+            "unique_texts": 131814
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 11793,
+            "min_text_length": 6,
+            "average_text_length": 36.62422360248447,
+            "max_text_length": 100,
+            "unique_texts": 321
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 11620,
+            "min_relevant_docs_per_query": 31,
+            "average_relevant_docs_per_query": 36.08695652173913,
+            "max_relevant_docs_per_query": 1288,
+            "unique_relevant_docs": 32537
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 106558,
+        "number_of_characters": 48164581,
+        "documents_text_statistics": {
+            "total_text_length": 47886101,
+            "min_text_length": 9,
+            "average_text_length": 472.6783768310499,
+            "max_text_length": 8689,
+            "unique_texts": 101308
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 278480,
+            "min_text_length": 11,
+            "average_text_length": 53.04380952380952,
+            "max_text_length": 196,
+            "unique_texts": 5124
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 6254,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.1912380952380952,
+            "max_relevant_docs_per_query": 15,
+            "unique_relevant_docs": 1324
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 117974,
+        "number_of_characters": 35927363,
+        "documents_text_statistics": {
+            "total_text_length": 35335613,
+            "min_text_length": 22,
+            "average_text_length": 316.47705838625023,
+            "max_text_length": 4105,
+            "unique_texts": 111651
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 591750,
+            "min_text_length": 21,
+            "average_text_length": 93.61651637399146,
+            "max_text_length": 280,
+            "unique_texts": 6321
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 12642,
+            "min_relevant_docs_per_query": 2,
+            "average_relevant_docs_per_query": 2.0,
+            "max_relevant_docs_per_query": 2,
+            "unique_relevant_docs": 11874
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "dev": {
+        "num_samples": 107153,
+        "number_of_characters": 33316879,
+        "documents_text_statistics": {
+            "total_text_length": 33200903,
+            "min_text_length": 2,
+            "average_text_length": 320.30199218561575,
+            "max_text_length": 1712,
+            "unique_texts": 103641
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 115976,
+            "min_text_length": 8,
+            "average_text_length": 33.15494568324757,
+            "max_text_length": 190,
+            "unique_texts": 3498
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 3700,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.0577472841623785,
+            "max_relevant_docs_per_query": 4,
+            "unique_relevant_docs": 3698
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/NanoNQ-VN.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 104095,
+        "number_of_characters": 52312680,
+        "documents_text_statistics": {
+            "total_text_length": 52220289,
+            "min_text_length": 10,
+            "average_text_length": 510.98673124908265,
+            "max_text_length": 10245,
+            "unique_texts": 102181
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 92391,
+            "min_text_length": 22,
+            "average_text_length": 48.62684210526316,
+            "max_text_length": 113,
+            "unique_texts": 1900
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 2283,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.201578947368421,
+            "max_relevant_docs_per_query": 4,
+            "unique_relevant_docs": 2283
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/descriptive_stats/Retrieval/TVPLRetrieval.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+    "test": {
+        "num_samples": 20561,
+        "number_of_characters": 10832770,
+        "documents_text_statistics": {
+            "total_text_length": 9929303,
+            "min_text_length": 9,
+            "average_text_length": 938.8524016641452,
+            "max_text_length": 6319,
+            "unique_texts": 10573
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 903467,
+            "min_text_length": 13,
+            "average_text_length": 90.48242363545317,
+            "max_text_length": 228,
+            "unique_texts": 9985
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 11158,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 1.1174762143214823,
+            "max_relevant_docs_per_query": 8,
+            "unique_relevant_docs": 10576
+        },
+        "top_ranked_statistics": null
+    }
+}

mteb/models/model_implementations/octen_models.py CHANGED Viewed

@@ -205,7 +205,7 @@ Octen_Embedding_8B = ModelMeta(
     name="bflhc/Octen-Embedding-8B",
     languages=multilingual_langs,
     open_weights=True,
-    revision="2030603c2926ab005fafd824fac5911e271be21f",
+    revision="f7db178d5a82fb841f606a6a67c423cead2fdbba",
     release_date="2025-12-23",
     n_parameters=7567295488,
     memory_usage_mb=14433,

mteb/tasks/retrieval/vie/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from .argu_ana_vn_retrieval import ArguAnaVN
-from .climate_fevervn_retrieval import ClimateFEVERVN
+from .climate_fevervn_retrieval import ClimateFEVERVN, NanoClimateFEVERVN
 from .cqa_dupstack_android_vn_retrieval import CQADupstackAndroidVN
 from .cqa_dupstack_gis_vn_retrieval import CQADupstackGisVN
 from .cqa_dupstack_mathematica_vn_retrieval import CQADupstackMathematicaVN
@@ -10,19 +10,20 @@ from .cqa_dupstack_tex_vn_retrieval import CQADupstackTexVN
 from .cqa_dupstack_unix_vn_retrieval import CQADupstackUnixVN
 from .cqa_dupstack_webmasters_vn_retrieval import CQADupstackWebmastersVN
 from .cqa_dupstack_wordpress_vn_retrieval import CQADupstackWordpressVN
-from .db_pedia_vn_retrieval import DBPediaVN
-from .fevervn_retrieval import FEVERVN
+from .db_pedia_vn_retrieval import DBPediaVN, NanoDBPediaVN
+from .fevervn_retrieval import FEVERVN, NanoFEVERVN
 from .fi_qa2018_vn_retrieval import FiQA2018VN
 from .green_node_table_markdown_retrieval import GreenNodeTableMarkdownRetrieval
-from .hotpot_qavn_retrieval import HotpotQAVN
-from .msmarcovn_retrieval import MSMARCOVN
+from .hotpot_qavn_retrieval import HotpotQAVN, NanoHotpotQAVN
+from .msmarcovn_retrieval import MSMARCOVN, NanoMSMARCOVN
 from .nf_corpus_vn_retrieval import NFCorpusVN
-from .nqvn_retrieval import NQVN
+from .nqvn_retrieval import NQVN, NanoNQVN
 from .quora_vn_retrieval import QuoraVN
 from .sci_fact_vn_retrieval import SciFactVN
 from .scidocsvn_retrieval import SCIDOCSVN
 from .touche2020_vn_retrieval import Touche2020VN
 from .treccovidvn_retrieval import TRECCOVIDVN
+from .tvpl_retrieval import TVPLRetrieval
 from .vie_qu_ad_retrieval import VieQuADRetrieval
 from .zac_legal_text_retrieval import ZacLegalTextRetrieval
@@ -49,8 +50,15 @@ __all__ = [
     "GreenNodeTableMarkdownRetrieval",
     "HotpotQAVN",
     "NFCorpusVN",
+    "NanoClimateFEVERVN",
+    "NanoDBPediaVN",
+    "NanoFEVERVN",
+    "NanoHotpotQAVN",
+    "NanoMSMARCOVN",
+    "NanoNQVN",
     "QuoraVN",
     "SciFactVN",
+    "TVPLRetrieval",
     "Touche2020VN",
     "VieQuADRetrieval",
     "ZacLegalTextRetrieval",

mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class ClimateFEVERVN(AbsTaskRetrieval):
 """,
         adapted_from=["ClimateFEVER"],
     )
+class NanoClimateFEVERVN(AbsTaskRetrieval):
+    # Task definition for NanoClimateFEVER-VN, the reduced ("nano") variant of
+    # ClimateFEVER-VN (see adapted_from). Evaluated on the "test" split and
+    # scored with nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoClimateFEVER-VN",
+        description="NanoClimateFEVERVN is a small version of ClimateFEVER-VN, a translated dataset from CLIMATE-FEVER, a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
+        dataset={
+            "path": "GreenNode/nano-climate-fever-vn",
+            "revision": "1852e852f07403d4529a8520d52b91ff6d57869b",
+        },
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=["Claim verification"],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a claim about climate change, retrieve documents that support or refute the claim"
+        },
+        adapted_from=["ClimateFEVER-VN"],
+    )

mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class DBPediaVN(AbsTaskRetrieval):
 """,
         adapted_from=["DBPedia"],
     )
+class NanoDBPediaVN(AbsTaskRetrieval):
+    # Task definition for NanoDBPedia-VN, the reduced ("nano") variant of
+    # DBPedia-VN (see adapted_from). Evaluated on the "test" split and scored
+    # with nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoDBPedia-VN",
+        description="NanoDBPediaVN is a small version of DBPedia-VN, a translated dataset from DBpedia-Entity, a standard test collection for entity search over the DBpedia knowledge base. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://github.com/iai-group/DBpedia-Entity/",
+        dataset={
+            "path": "GreenNode/nano-dbpedia-vn",
+            "revision": "bbc3259bc63bf1e250d7034024092cc3230d5850",
+        },
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Written", "Encyclopaedic"],
+        task_subtypes=[],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a query, retrieve relevant entity descriptions from DBPedia"
+        },
+        adapted_from=["DBPedia-VN"],
+    )

mteb/tasks/retrieval/vie/fevervn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class FEVERVN(AbsTaskRetrieval):
 """,
         adapted_from=["FEVER"],
     )
+class NanoFEVERVN(AbsTaskRetrieval):
+    # Task definition for NanoFEVER-VN, the reduced ("nano") variant of
+    # FEVER-VN (see adapted_from). Evaluated on the "test" split and scored
+    # with nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoFEVER-VN",
+        dataset={
+            "path": "GreenNode/nano-fever-vn",
+            "revision": "457ca6b058ed19b28f2359e2d816d7527af6bef8",
+        },
+        description="NanoFEVERVN is a small version of FEVER-VN, a translated dataset from FEVER (Fact Extraction and VERification), which consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://fever.ai/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=["Claim verification"],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a claim, retrieve documents that support or refute the claim"
+        },
+        adapted_from=["FEVER-VN"],
+    )

mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class HotpotQAVN(AbsTaskRetrieval):
 """,
         adapted_from=["HotpotQA"],
     )
+class NanoHotpotQAVN(AbsTaskRetrieval):
+    # Task definition for NanoHotpotQA-VN, the reduced ("nano") variant of
+    # HotpotQA-VN (see adapted_from). Evaluated on the "test" split and scored
+    # with nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoHotpotQA-VN",
+        dataset={
+            "path": "GreenNode/nano-hotpotqa-vn",
+            "revision": "f4de19a2fae1a582de114e5bcd178bb262183113",
+        },
+        description="NanoHotpotQAVN is a small version of HotpotQA-VN, a translated dataset from HotpotQA, a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://hotpotqa.github.io/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Web", "Written"],
+        task_subtypes=["Question answering"],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a multi-hop question, retrieve documents that can help answer the question"
+        },
+        adapted_from=["HotpotQA-VN"],
+    )

mteb/tasks/retrieval/vie/msmarcovn_retrieval.py CHANGED Viewed

@@ -47,3 +47,51 @@ class MSMARCOVN(AbsTaskRetrieval):
 """,
         adapted_from=["MSMARCO"],
     )
+class NanoMSMARCOVN(AbsTaskRetrieval):
+    # Task definition for NanoMSMARCO-VN, the reduced ("nano") variant of
+    # MSMARCO-VN (see adapted_from). Unlike the sibling nano tasks in this
+    # release, it is evaluated on the "dev" split; scored with nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoMSMARCO-VN",
+        dataset={
+            "path": "GreenNode/nano-msmarco-vn",
+            "revision": "f149369c82ec228b05b0f6677699ab4bfbab73f6",
+        },
+        description="NanoMSMARCOVN is a small version of MSMARCO-VN, a translated dataset from MS MARCO, a collection of datasets focused on deep learning in search. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://microsoft.github.io/msmarco/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["dev"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=[
+            "Encyclopaedic",
+            "Academic",
+            "Blog",
+            "News",
+            "Medical",
+            "Government",
+            "Reviews",
+            "Non-fiction",
+            "Social",
+            "Web",
+        ],
+        task_subtypes=["Question answering"],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={"query": "Given a query, retrieve relevant documents from MS MARCO-VN"},
+        adapted_from=["MSMARCO-VN"],
+    )

mteb/tasks/retrieval/vie/nqvn_retrieval.py CHANGED Viewed

@@ -36,3 +36,42 @@ class NQVN(AbsTaskRetrieval):
 """,
         adapted_from=["NQ"],
     )
+class NanoNQVN(AbsTaskRetrieval):
+    # Task definition for NanoNQ-VN, the reduced ("nano") variant of NQ-VN
+    # (see adapted_from). Evaluated on the "test" split and scored with
+    # nDCG@10.
+    metadata = TaskMetadata(
+        name="NanoNQ-VN",
+        dataset={
+            "path": "GreenNode/nano-nq-vn",
+            "revision": "1ad4d6556fe0e5314994839089ce070fb0db8b19",
+        },
+        # The description previously described NFCorpus (a medical IR dataset),
+        # which contradicted the reference URL and prompt below; it now
+        # describes Natural Questions.
+        description="NanoNQVN is a small version of NQ-VN, a translated dataset from Natural Questions (NQ), a question answering dataset whose questions are answered with Wikipedia passages. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.",
+        reference="https://ai.google.com/research/NaturalQuestions/",
+        type="Retrieval",
+        category="t2t",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Written", "Encyclopaedic"],
+        task_subtypes=["Question answering"],
+        # Citation is the VN-MTEB paper shared by the Vietnamese translated tasks.
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        prompt={
+            "query": "Given a question, retrieve Wikipedia passages that answer the question"
+        },
+        adapted_from=["NQ-VN"],
+    )

mteb/tasks/retrieval/vie/tvpl_retrieval.py ADDED Viewed

@@ -0,0 +1,42 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+# NOTE(review): TEST_SAMPLES is not referenced anywhere else in this module as
+# shown in this diff — confirm whether it is still needed.
+TEST_SAMPLES = 2048
+class TVPLRetrieval(AbsTaskRetrieval):
+    # Vietnamese legal-text retrieval task backed by the
+    # GreenNode/TVPL-Retrieval-VN dataset; evaluated on the "test" split and
+    # scored with nDCG@10.
+    metadata = TaskMetadata(
+        # --- identity and data source ---
+        name="TVPLRetrieval",
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        dataset={
+            "path": "GreenNode/TVPL-Retrieval-VN",
+            "revision": "6661dba4dfedff606537732d9f35f2c3738b081a",
+        },
+        # --- evaluation setup ---
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        # --- provenance and classification ---
+        description="A Vietnamese dataset for evaluating legal text retrieval. From Thu vien phap luat (TVPL) dataset: Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models.",
+        # NOTE(review): the reference URL points at a COLING 2020 paper while
+        # bibtex_citation below is a 2025 ACM journal article — confirm the two
+        # are intentionally different.
+        reference="https://aclanthology.org/2020.coling-main.233.pdf",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        domains=["Legal"],
+        task_subtypes=["Question answering"],
+        annotations_creators="human-annotated",
+        sample_creation="found",
+        dialect=[],
+        bibtex_citation=r"""
+@article{10.1145/3732938,
+  address = {New York, NY, USA},
+  author = {Le, Huong and Luu, Ngoc and Nguyen, Thanh and Dao, Tuan and Dinh, Sang},
+  doi = {10.1145/3732938},
+  issn = {2375-4699},
+  journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+  publisher = {Association for Computing Machinery},
+  title = {Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models},
+  url = {https://doi.org/10.1145/3732938},
+  year = {2025},
+}
+""",
+    )

mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py CHANGED Viewed

@@ -24,5 +24,19 @@ class ZacLegalTextRetrieval(AbsTaskRetrieval):
         annotations_creators="human-annotated",
         dialect=[],
         sample_creation="found",
-        bibtex_citation="",  # TODO: Add bibtex citation when the paper is published
+        bibtex_citation=r"""
+@inproceedings{10.1007/978-981-95-1746-6_17,
+  address = {Singapore},
+  author = {Pham, Bao Loc
+and Hoang, Quoc Viet
+and Luu, Quy Tung
+and Vo, Trong Thu},
+  booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
+  isbn = {978-981-95-1746-6},
+  pages = {153--163},
+  publisher = {Springer Nature Singapore},
+  title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
+  year = {2026},
+}
+""",
     )

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.6.5
+Version: 2.6.6
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -32,8 +32,6 @@ Requires-Dist: rich>=0.0.0
 Requires-Dist: pytrec-eval-terrier>=0.5.6
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: polars>=0.20.22
-Requires-Dist: torch<2.9.0; python_full_version < "3.14"
-Requires-Dist: torch>=2.9.0; python_full_version >= "3.14"
 Provides-Extra: image
 Requires-Dist: torchvision>0.2.1; extra == "image"
 Requires-Dist: transformers[torch-vision,vision]; extra == "image"

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/RECORD RENAMED Viewed

@@ -1254,13 +1254,19 @@ mteb/descriptive_stats/Retrieval/NQ-VN.json,sha256=lz7Jb865vUqLOxZhd8StxxAmlyNg-
 mteb/descriptive_stats/Retrieval/NQ.json,sha256=ylIFn-uHev-jkcua8SUmiDCRanM9uCkvRElU-kIGIJg,1014
 mteb/descriptive_stats/Retrieval/NQHardNegatives.json,sha256=uPcQxhFQ9R7HGcEu8c9U4K1a5yYntN-mVK4anaRHtNo,986
 mteb/descriptive_stats/Retrieval/NanoArguAnaRetrieval.json,sha256=Qv5QaFK0wXUec-9rv6K71oTgwdeOWxPpGEA-gu0-BkI,976
+mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json,sha256=8isr2BOTbbFU4_Ivwof3-MTdxngG2SMl_GYrD_vbg3Q,1010
 mteb/descriptive_stats/Retrieval/NanoClimateFeverRetrieval.json,sha256=CdBXQhfQhtKG9_64I6AXDV4giSRppmLZDB3S8M28TOA,973
+mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json,sha256=xhNOXfcG-shzlptKuHBu9dkRXQAbmlknQqu8vhxKb6g,1012
 mteb/descriptive_stats/Retrieval/NanoDBPediaRetrieval.json,sha256=eAZJSH9WPk6AVkonlshmX9RHqq-b6iLTPmzO3yJFesk,974
+mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json,sha256=vb5rzqP3SHVo3R85xRaS-nXUfH0b6KQMAbFSmK6U--o,1010
 mteb/descriptive_stats/Retrieval/NanoFEVERRetrieval.json,sha256=ac505XP13F5NRmCaQPwKdH-v9JTESsieu-K1IEa4j-I,971
 mteb/descriptive_stats/Retrieval/NanoFiQA2018Retrieval.json,sha256=eB00Q60zKfJmIY6HO083-eWIKo1STY8z4WdzRrKMI4I,973
+mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json,sha256=JVJUfPow4rwxuUuMNJ_ygusaYDm1s7tBJX5IzUSfXLQ,998
 mteb/descriptive_stats/Retrieval/NanoHotpotQARetrieval.json,sha256=mAgjR7ekGKqk0QtiZxK-iuPWJIFWi9yvAO6j9liz-iQ,972
+mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json,sha256=InXDVB08Q11Pb9IU2H2s7rZT_DXnbdpcs2duj66EdHI,1008
 mteb/descriptive_stats/Retrieval/NanoMSMARCORetrieval.json,sha256=_2ap0Bglk-hVK2rYJy3E4ECVm6Kf3yqhvWYQ99ZXruM,970
 mteb/descriptive_stats/Retrieval/NanoNFCorpusRetrieval.json,sha256=BjRdljofdnrJqn8BQdRpoxoanU5-XdeSn48085N2o4Q,977
+mteb/descriptive_stats/Retrieval/NanoNQ-VN.json,sha256=l_49qFdL8DtxaZ9i9lX5dJcxG1KnjvaOO-eyuVWsUAM,1010
 mteb/descriptive_stats/Retrieval/NanoNQRetrieval.json,sha256=q9TBcEqGV8fmoK4_32a-yDLhGN6FAj049XuN95Hhiiw,969
 mteb/descriptive_stats/Retrieval/NanoQuoraRetrieval.json,sha256=sG9ZgROl8kqDk3n2Rmb7zMgUmu0S8LqILZvjdevf-rQ,967
 mteb/descriptive_stats/Retrieval/NanoSCIDOCSRetrieval.json,sha256=J0a84pa1TupKlHC5Oi9zqhOkmKz2TqlcxPPXt58zuBU,973
@@ -1346,6 +1352,7 @@ mteb/descriptive_stats/Retrieval/TRECCOVID.json,sha256=VMICXZ2lA7GfiUyudOxYGRnMm
 mteb/descriptive_stats/Retrieval/TRECDL2019.json,sha256=6BOe9qATrKaRz8_cCMbqwXiu-ZiZq--Cm37uwTqSvJs,1013
 mteb/descriptive_stats/Retrieval/TRECDL2020.json,sha256=0WFbaPL2dyp5FZ1Wf0yAOVhndUfxP11sep562RHMplA,1014
 mteb/descriptive_stats/Retrieval/TV2Nordretrieval.json,sha256=_oDN_OfptM5ak-d4OXA-RU0hrtjvfg18jhir8CckxZ0,985
+mteb/descriptive_stats/Retrieval/TVPLRetrieval.json,sha256=m-t90SylbkuUUyu-MprOpB27h8xkoqjA2ebhvq5Vl98,1007
 mteb/descriptive_stats/Retrieval/TempReasonL1.json,sha256=-MpwGucuNT0aKOMWwGld9POo_vkSnjpnih8xIFnN5d4,975
 mteb/descriptive_stats/Retrieval/TempReasonL2Context.json,sha256=Gd2cVFAsdF1RHHWIbKI9hZLWgrbFzp8p0xoa6NU1uGM,996
 mteb/descriptive_stats/Retrieval/TempReasonL2Fact.json,sha256=om5WmIGXJLeMI-b0Tp7-odKRH-S9kx6OHXlnAD62rLk,992
@@ -1535,7 +1542,7 @@ mteb/models/model_implementations/nomic_models.py,sha256=dmQC_cWg6hAmiBHK7fXoXEi
 mteb/models/model_implementations/nomic_models_vision.py,sha256=usCKfZCR7aEi_DnNmVAYjH-lXx_ipQkBVtUAmhJ90QI,6870
 mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py,sha256=6dTGtK1GiaYdpJ4IQFgCCOkGyHQyuEUatKs-Uv-1YmE,6450
 mteb/models/model_implementations/nvidia_models.py,sha256=_lLfFl4-uSKpZdj_SDpdKiI2Gb5C1GgPqWSS-QdlYMM,21768
-mteb/models/model_implementations/octen_models.py,sha256=J_-eNARXLgN8H_v5fobOr01RXK-G3oWdv02hG4L_gWY,7511
+mteb/models/model_implementations/octen_models.py,sha256=FwQAcB_z6bFohpFlNQK2ugLBEOQUu533auOhrNqMxaM,7511
 mteb/models/model_implementations/openai_models.py,sha256=905BajYi_XyOZgqU3AeKpwIttLoUitaAyc48sTWI6Jg,9482
 mteb/models/model_implementations/openclip_models.py,sha256=MyosgeYSrgBXGuGFtI2Tyxksxpb7bADFJVSYFCLweVA,11622
 mteb/models/model_implementations/opensearch_neural_sparse_models.py,sha256=TnIHut_IHvplvovlcTZ-PWnEldTzcru5JdUIaTH-8Do,8636
@@ -2474,9 +2481,9 @@ mteb/tasks/retrieval/swe/swe_faq_retrieval.py,sha256=s-o7IM_l7giuK4bJMdYkq2CtE0Q
 mteb/tasks/retrieval/swe/swedn_retrieval.py,sha256=RFcpp0u-EKIwSRXR37tJ0_haY6Jvlfj8DWCgrD-0tnU,1512
 mteb/tasks/retrieval/tur/__init__.py,sha256=tAKhhsTK6meiZwRMIvbx7_ye90JAAW3dlS8iI0r_vg8,84
 mteb/tasks/retrieval/tur/tur_hist_quad.py,sha256=s7S5RrdwPx-0aatUwbgFbuLtj8927yQUHp1SEODfAl0,3669
-mteb/tasks/retrieval/vie/__init__.py,sha256=j69iltc-is1oqx0oIV1RVjjM46LLH-JJQzKnxm4cYvc,2142
+mteb/tasks/retrieval/vie/__init__.py,sha256=8k8aUndynSTP72j75e2tcU-8omMuGzOVZp3KxIAGaBg,2419
 mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py,sha256=wmE6syUs0sLs7xgIOxXQuiQzpxrskdsTc5sK46v1YEQ,1754
-mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py,sha256=4GMO5qSYbP0pFtf1yklMZNqFgh8qi1Xo2IXQDl9t14s,1849
+mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py,sha256=eonoS9NWKw-okR9Eqe4B8YgzGSbw0t7FcNpt0JwxyKU,3788
 mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py,sha256=1c6s1C0j1x7kE92WMv9JB4I_rdsHboyP-QILU-18rQ4,1851
 mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py,sha256=h--L4OiLIalxHnSulEiUZjMo7JRxjia-mKOnnoaOkzI,1813
 mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py,sha256=Jm5-2YbfBObFW_Ygwu03PAnSNMcZkH_7SL8L18KVWvQ,1857
@@ -2487,21 +2494,22 @@ mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py,sha256=9EiLKJrpRXACmxZ
 mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py,sha256=7Mr2sZrAKzFDeMT_7eQQ_52OKzefGFAnkcHmO4lntIo,1824
 mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py,sha256=2zDcrsCfcTAcybUmTpGeJQxUxNpkY7Ha8Tf0xwfqTcQ,1810
 mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py,sha256=ppFPam-3AXXVLVp_DiXeHaSr16Va44_-eRkOH0m5ypo,1821
-mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py,sha256=hOiwz2bcayDW6VrCvsIGeYh1TT7koByM76rZZwtp9KA,1754
-mteb/tasks/retrieval/vie/fevervn_retrieval.py,sha256=xLGoXefGk1l1AFiOSf2Ja0fM_rAQp4tdaR8H6jJqYlI,1853
+mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py,sha256=9YEubKLDCMJhck_EjY4r3VzAFDu-P4SWR5CLnHdSkTQ,3571
+mteb/tasks/retrieval/vie/fevervn_retrieval.py,sha256=JLrpB90G5c7ZR2jM9GsYE2YQ51qTnn5FH-LDzO99Z1Q,3768
 mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py,sha256=FGfFuLzRCTuupRxZdjVbBiwCOSspb3vwvtNAKvyXjso,1714
 mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py,sha256=O7iIcuvqhrHjB7J1VxH9YJ3v6cuFFBQdrrnYwLgeRfE,2429
-mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py,sha256=FYWj8EhnfwDuPRxZ8uTeGkfa2Q-jDU2bliTmp975Coc,1837
-mteb/tasks/retrieval/vie/msmarcovn_retrieval.py,sha256=xtJ1-rjx4slwSR8p6NedqItTk-79ZzT2f9FlDOhbzkE,1958
+mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py,sha256=Vg_YI8YbZpXMmwZXS-2KLRutL2Nehw5tW231S2qShd4,3753
+mteb/tasks/retrieval/vie/msmarcovn_retrieval.py,sha256=syDFYmXL2xK3xCQrBAopGul8_3pDZzBdIjMpk2XbA1s,3951
 mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py,sha256=4S8IDJ-TVjKEy2teM8GOeDzHIZR8txkPvX0sGDYIyqs,1780
-mteb/tasks/retrieval/vie/nqvn_retrieval.py,sha256=tQT2t6XcflVRM78t_5TujWD27e9uCMrsfN0DBjDBY0E,1744
+mteb/tasks/retrieval/vie/nqvn_retrieval.py,sha256=f8LmUGAmsMnCdn-ovfPcpX12X4rmdpXj3F-q6GwjBEc,3551
 mteb/tasks/retrieval/vie/quora_vn_retrieval.py,sha256=VkgKCFbDkOuZAsMl36lOr-MuvbhNfE8zUmmiySW9lSY,1837
 mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py,sha256=7F3wSU9N2BAj4Jmzw7sjbcxTyYDYs_3I1434X3riaZ4,1773
 mteb/tasks/retrieval/vie/scidocsvn_retrieval.py,sha256=WlcfDfF43jsNf9D_Bl3k02RiiPdedORID6CEEMAYTLc,1815
 mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py,sha256=DKcNwCCdANt7hNr3fLao9jkIJJjfxJ0jLLbD7_b-KnE,1752
 mteb/tasks/retrieval/vie/treccovidvn_retrieval.py,sha256=ZlFFL37Zd_sbKXaUZx41XTxps-nnOi3PnBNCy9KvlJU,1826
+mteb/tasks/retrieval/vie/tvpl_retrieval.py,sha256=CGwgT9spHONw9cOeuum_BS7khZbooqoNqJgVV6Utfic,1611
 mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py,sha256=eZh1rR43iXDHoylOGKjrUCopzEujE-1GSGTn2TMrkro,3621
-mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py,sha256=Y93j0EwG6-bcc0DMLvHP9q3r9b_3xLXu6YBR0Q5HDho,985
+mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py,sha256=BI2GbbkOPnWQpbn9ul6ShHugAZ994iiS7hVi5v1K17Y,1386
 mteb/tasks/retrieval/zho/__init__.py,sha256=dIN-rPfrEjkCuUCha8SpQdlzWYY6IMO_HLxebcBhQxA,438
 mteb/tasks/retrieval/zho/cmteb_retrieval.py,sha256=DXNkvMQQZsKv1U5L_0boKEXGLDPn4RfauIlxwb0f-EQ,10789
 mteb/tasks/retrieval/zho/le_ca_r_dv2_retrieval.py,sha256=O7kNB_7rpgG7_KsKC0SUKG42dhx66Rakk77uy4Iufk0,1293
@@ -2604,9 +2612,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=GwkBPmAr18Onu-vHtzHs0PFrhCozdOMiT13HwnWL4ZM,3961
-mteb-2.6.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.6.5.dist-info/METADATA,sha256=27kspNt-a7zJ0Ihl2nB5m4Ak1-hba5xQjBuqGnCFWcQ,14397
-mteb-2.6.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mteb-2.6.5.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.6.5.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.6.5.dist-info/RECORD,,
+mteb-2.6.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.6.6.dist-info/METADATA,sha256=s0uH9FABmjhyRn2bwsWVFFxjRtJWEYbQaqEuavtj_mY,14281
+mteb-2.6.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.6.6.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.6.6.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.6.6.dist-info/RECORD,,

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mteb-2.6.5.dist-info → mteb-2.6.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

mteb 2.6.5__py3-none-any.whl → 2.6.6__py3-none-any.whl

mteb 2.6.5py3-none-any.whl → 2.6.6py3-none-any.whl