evalscope 0.5.5rc0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/__init__.py +4 -0
- evalscope/backend/rag_eval/backend_manager.py +80 -0
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
- evalscope/backend/rag_eval/cmteb/base.py +91 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +151 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +47 -0
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
- evalscope/backend/rag_eval/ragas/task_template.py +61 -0
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +1 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/models/api/openai_api.py +2 -2
- evalscope/perf/http_client.py +1 -1
- evalscope/perf/openai_api.py +2 -0
- evalscope/run.py +4 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/METADATA +95 -99
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/RECORD +49 -18
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/WHEEL +1 -1
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
|
|
2
|
+
from mteb.abstasks.TaskMetadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Ocnli(AbsTaskPairClassification):
    """Pair-classification task over the ``C-MTEB/OCNLI`` dataset.

    Scored with ``max_accuracy`` on the ``validation`` split. The raw
    columns ``sent1``/``sent2`` are renamed in :meth:`dataset_transform`.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="Ocnli",
        description="Original Chinese Natural Language Inference dataset",
        reference="https://arxiv.org/abs/2010.05444",
        # --- evaluation setup ---
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["validation"],
        main_score="max_accuracy",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/OCNLI",
            "revision": "66e76a618a34d6d565d5538088562851e6daa7ec",
        },
        descriptive_stats={"n_samples": None, "avg_character_length": None},
        bibtex_citation="""@misc{hu2020ocnli,
title={OCNLI: Original Chinese Natural Language Inference},
author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss},
year={2020},
eprint={2010.05444},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )

    def dataset_transform(self):
        """Rename ``sent1``/``sent2`` to ``sentence1``/``sentence2`` on ``self.dataset``."""
        column_renames = (("sent1", "sentence1"), ("sent2", "sentence2"))
        for old_name, new_name in column_renames:
            # Re-assign after every rename, one column at a time.
            self.dataset = self.dataset.rename_column(old_name, new_name)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Cmnli(AbsTaskPairClassification):
    """Pair-classification task over the ``C-MTEB/CMNLI`` dataset.

    Scored with ``max_accuracy`` on the ``validation`` and ``test`` splits.
    The raw columns ``sent1``/``sent2`` are renamed in
    :meth:`dataset_transform`.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="Cmnli",
        description="Chinese Multi-Genre NLI",
        reference="https://huggingface.co/datasets/clue/viewer/cmnli",
        # --- evaluation setup ---
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["validation", "test"],
        main_score="max_accuracy",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/CMNLI",
            "revision": "41bc36f332156f7adc9e38f53777c959b2ae9766",
        },
        descriptive_stats={"n_samples": None, "avg_character_length": None},
        bibtex_citation="""@inproceedings{xu-etal-2020-clue,
title = "{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark",
author = "Xu, Liang and
Hu, Hai and
Zhang, Xuanwei and
Li, Lu and
Cao, Chenjie and
Li, Yudong and
Xu, Yechen and
Sun, Kai and
Yu, Dian and
Yu, Cong and
Tian, Yin and
Dong, Qianqian and
Liu, Weitang and
Shi, Bo and
Cui, Yiming and
Li, Junyi and
Zeng, Jun and
Wang, Rongzhao and
Xie, Weijian and
Li, Yanting and
Patterson, Yina and
Tian, Zuoyu and
Zhang, Yiwen and
Zhou, He and
Liu, Shaoweihua and
Zhao, Zhe and
Zhao, Qipeng and
Yue, Cong and
Zhang, Xinrui and
Yang, Zhengliang and
Richardson, Kyle and
Lan, Zhenzhong",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.419",
doi = "10.18653/v1/2020.coling-main.419",
pages = "4762--4772",
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )

    def dataset_transform(self):
        """Rename ``sent1``/``sent2`` to ``sentence1``/``sentence2`` on ``self.dataset``."""
        column_renames = (("sent1", "sentence1"), ("sent2", "sentence2"))
        for old_name, new_name in column_renames:
            # Re-assign after every rename, one column at a time.
            self.dataset = self.dataset.rename_column(old_name, new_name)
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
|
|
2
|
+
from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class T2Reranking(AbsTaskReranking):
    """Reranking task over the ``C-MTEB/T2Reranking`` dataset.

    Scored with ``map`` on the ``dev`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="T2Reranking",
        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
        reference="https://arxiv.org/abs/2304.03679",
        # --- evaluation setup ---
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="map",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/T2Reranking",
            "revision": "76631901a18387f85eaa53e5450019b87ad58ef9",
        },
        descriptive_stats={"n_samples": None, "avg_character_length": None},
        bibtex_citation="""@misc{xie2023t2ranking,
title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
year={2023},
eprint={2304.03679},
archivePrefix={arXiv},
primaryClass={cs.IR}
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        form=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class MMarcoReranking(AbsTaskReranking):
    """Reranking task over the ``C-MTEB/Mmarco-reranking`` dataset.

    Scored with ``map`` on the ``dev`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="MMarcoReranking",
        description="mMARCO is a multilingual version of the MS MARCO passage ranking dataset",
        reference="https://github.com/unicamp-dl/mMARCO",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/Mmarco-reranking",
            "revision": "8e0c766dbe9e16e1d221116a3f36795fbade07f6",
        },
        # --- evaluation setup ---
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_splits=["dev"],
        eval_langs=["cmn-Hans"],
        main_score="map",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        form=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        # BibTeX separates authors with exactly one "and"; a doubled
        # separator would parse as an extra, empty author entry.
        bibtex_citation="""@misc{bonifacio2021mmarco,
title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira},
year={2021},
eprint={2108.13897},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        descriptive_stats={"n_samples": None, "avg_character_length": None},
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class CMedQAv1(AbsTaskReranking):
    """Reranking task over the ``C-MTEB/CMedQAv1-reranking`` dataset.

    Scored with ``map`` on the ``test`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="CMedQAv1",
        description="Chinese community medical question answering",
        reference="https://github.com/zhangsheng93/cMedQA",
        # --- evaluation setup ---
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["test"],
        main_score="map",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/CMedQAv1-reranking",
            "revision": "8d7f1e942507dac42dc58017c1a001c3717da7df",
        },
        descriptive_stats={
            "n_samples": {"test": 2000},
            "avg_character_length": {"test": 165},
        },
        # --- provenance ---
        date=("2017-01-01", "2017-07-26"),
        domains=["Medical", "Written"],
        task_subtypes=[],
        license="Not specified",
        annotations_creators="expert-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@article{zhang2017chinese,
title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs},
author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun},
journal={Applied Sciences},
volume={7},
number={8},
pages={767},
year={2017},
publisher={Multidisciplinary Digital Publishing Institute}
}""",
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class CMedQAv2(AbsTaskReranking):
    """Reranking task over the ``C-MTEB/CMedQAv2-reranking`` dataset.

    Scored with ``map`` on the ``test`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="CMedQAv2",
        description="Chinese community medical question answering",
        reference="https://github.com/zhangsheng93/cMedQA2",
        # --- evaluation setup ---
        type="Reranking",
        category="s2s",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["test"],
        main_score="map",
        # --- data source, pinned to a fixed revision ---
        dataset={
            "path": "C-MTEB/CMedQAv2-reranking",
            "revision": "23d186750531a14a0357ca22cd92d712fd512ea0",
        },
        descriptive_stats={"n_samples": None, "avg_character_length": None},
        bibtex_citation="""@ARTICLE{8548603,
author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
journal={IEEE Access},
title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
year={2018},
volume={6},
number={},
pages={74061-74071},
keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
doi={10.1109/ACCESS.2018.2883637},
ISSN={2169-3536},
month={},}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        form=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
from mteb import AbsTaskRetrieval
|
|
2
|
+
from mteb.abstasks.TaskMetadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class T2Retrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/T2Retrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against mteb's AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # --- identity ---
        name="T2Retrieval",
        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
        reference="https://arxiv.org/abs/2304.03679",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/T2Retrieval",
            "revision": "8731a845f1bf500a4f111cf1070785c793d10e64",
            "qrel_revision": "1c83b8d1544e529875e3f6930f3a1fcf749a8e97",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 874.1184182791619,
                    "average_query_length": 10.938847974750132,
                    "num_documents": 118605,
                    "num_queries": 22812,
                    "average_relevant_docs_per_query": 5.213571804313519,
                }
            },
        },
        bibtex_citation="""@misc{xie2023t2ranking,
title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
year={2023},
eprint={2304.03679},
archivePrefix={arXiv},
primaryClass={cs.IR}
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MMarcoRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/MMarcoRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against mteb's AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # --- identity ---
        name="MMarcoRetrieval",
        description="MMarcoRetrieval",
        reference="https://arxiv.org/abs/2309.07597",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/MMarcoRetrieval",
            "revision": "539bbde593d947e2a124ba72651aafc09eb33fc2",
            "qrel_revision": "bae08bb7bddbedb96c7e7db52018a55167b67f89",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 114.41787048392986,
                    "average_query_length": 10.51131805157593,
                    "num_documents": 106813,
                    "num_queries": 6980,
                    "average_relevant_docs_per_query": 1.0654727793696275,
                }
            },
        },
        bibtex_citation="""@misc{xiao2024cpack,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class DuRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/DuRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="DuRetrieval",
        description="A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine",
        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/DuRetrieval",
            "revision": "a1a333e290fe30b10f3f56498e3a0d911a693ced",
            "qrel_revision": "497b7bd1bbb25cb3757ff34d95a8be50a3de2279",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 331.3219967800322,
                    "average_query_length": 9.289,
                    "num_documents": 100001,
                    "num_queries": 2000,
                    "average_relevant_docs_per_query": 4.9195,
                }
            },
        },
        bibtex_citation="""@misc{qiu2022dureaderretrieval,
title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine},
author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang},
year={2022},
eprint={2203.10232},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class CovidRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/CovidRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="CovidRetrieval",
        description="COVID-19 news articles",
        reference="https://arxiv.org/abs/2203.03367",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/CovidRetrieval",
            "revision": "1271c7809071a13532e05f25fb53511ffce77117",
            "qrel_revision": "a9f41b7cdf24785531d12417ce0d1157ed4b39ca",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 332.4152658473415,
                    "average_query_length": 25.9304531085353,
                    "num_documents": 100001,
                    "num_queries": 949,
                    "average_relevant_docs_per_query": 1.0105374077976819,
                }
            },
        },
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation=None,
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class CmedqaRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/CmedqaRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    metadata = TaskMetadata(
        # --- identity ---
        name="CmedqaRetrieval",
        description="Online medical consultation text. Used the CMedQAv2 as its underlying dataset.",
        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/CmedqaRetrieval",
            "revision": "cd540c506dae1cf9e9a59c3e06f42030d54e7301",
            "qrel_revision": "279d737f36c731c8ff6e2b055f31fe02216fa23d",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 307.7710222897771,
                    "average_query_length": 48.470367591897976,
                    "num_documents": 100001,
                    "num_queries": 3999,
                    "average_relevant_docs_per_query": 1.86271567891973,
                }
            },
        },
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation=None,
    )
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class EcomRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/EcomRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against mteb's AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # --- identity ---
        name="EcomRetrieval",
        description="EcomRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/EcomRetrieval",
            "revision": "687de13dc7294d6fd9be10c6945f9e8fec8166b9",
            "qrel_revision": "39c90699b034ec22ac45b3abf5b0bbb5ffd421f9",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 32.98041664189015,
                    "average_query_length": 6.798,
                    "num_documents": 100902,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation=None,
    )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class MedicalRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/MedicalRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against mteb's AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # --- identity ---
        name="MedicalRetrieval",
        description="MedicalRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/MedicalRetrieval",
            "revision": "2039188fb5800a9803ba5048df7b76e6fb151fc6",
            "qrel_revision": "37b8efec53c54c3d9c6af212f6710b62ccdf895c",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 122.04231725066585,
                    "average_query_length": 17.938,
                    "num_documents": 100999,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation=None,
    )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
class VideoRetrieval(AbsTaskRetrieval):
    """Retrieval task over the ``C-MTEB/VideoRetrieval`` dataset.

    Scored with ``ndcg_at_10`` on the ``dev`` split.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against mteb's AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # --- identity ---
        name="VideoRetrieval",
        description="VideoRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # --- evaluation setup ---
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # --- data source; corpus and qrels are pinned separately ---
        dataset={
            "path": "C-MTEB/VideoRetrieval",
            "revision": "58c2597a5943a2ba48f4668c3b90d796283c5639",
            "qrel_revision": "faa71382b6a29cf1778d1f436b963e75cb5b927c",
        },
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 31.048855642524522,
                    "average_query_length": 7.365,
                    "num_documents": 100930,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
        # --- provenance fields that are not filled in for this task ---
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation=None,
    )
|