evalscope 0.5.5rc1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/__init__.py +4 -0
- evalscope/backend/rag_eval/backend_manager.py +80 -0
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
- evalscope/backend/rag_eval/cmteb/base.py +91 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +151 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +47 -0
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
- evalscope/backend/rag_eval/ragas/task_template.py +61 -0
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +1 -0
- evalscope/models/api/openai_api.py +2 -2
- evalscope/perf/http_client.py +1 -1
- evalscope/perf/openai_api.py +2 -0
- evalscope/run.py +4 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/METADATA +95 -99
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/RECORD +48 -17
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/WHEEL +1 -1
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/top_level.txt +0 -0
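The headline change in 0.6.0 is the new `evalscope/backend/rag_eval` backend, which bundles CMTEB text-embedding tasks, a CLIP benchmark, and RAGAS-based RAG evaluation. As an illustrative sketch of how the new backend might be driven through evalscope's task runner — the backend string "RAGEval", the tool name, and every config key below are assumptions, since the registration code is not shown in this diff:

from evalscope.run import run_task

# Hypothetical config: backend/tool names and keys are assumptions, not shown in this diff.
task_cfg = {
    "eval_backend": "RAGEval",
    "eval_config": {
        "tool": "MTEB",  # the CMTEB wrapper added under backend/rag_eval/cmteb
        "model": [{"model_name_or_path": "AI-ModelScope/bge-large-zh"}],
        "eval": {"tasks": ["ATEC", "BQ"]},
    },
}
run_task(task_cfg=task_cfg)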

evalscope/backend/rag_eval/cmteb/tasks/STS.py
@@ -0,0 +1,302 @@
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+class ATEC(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="ATEC",
+        dataset={
+            "path": "C-MTEB/ATEC",
+            "revision": "0f319b1142f28d00e055a6770f3f726ae9b7d865",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["validation", "test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@inproceedings{raghu-etal-2021-end,
+    title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
+    author = "Raghu, Dinesh and
+      Agarwal, Shantanu and
+      Joshi, Sachindra and
+      {Mausam}",
+    editor = "Moens, Marie-Francine and
+      Huang, Xuanjing and
+      Specia, Lucia and
+      Yih, Scott Wen-tau",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-main.357",
+    doi = "10.18653/v1/2021.emnlp-main.357",
+    pages = "4348--4366",
+    abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+
+class BQ(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="BQ",
+        dataset={
+            "path": "C-MTEB/BQ",
+            "revision": "e3dda5e115e487b39ec7e618c0c6a29137052a55",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["validation", "test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+      year={2024},
+      eprint={2309.07597},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2309.07597},
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+
+class LCQMC(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="LCQMC",
+        dataset={
+            "path": "C-MTEB/LCQMC",
+            "revision": "17f9b096f80380fce5ed12a9be8be7784b337daf",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+      year={2024},
+      eprint={2309.07597},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2309.07597},
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+
+class PAWSX(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="PAWSX",
+        dataset={
+            "path": "C-MTEB/PAWSX",
+            "revision": "9c6a90e430ac22b5779fb019a23e820b11a8b5e1",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+      year={2024},
+      eprint={2309.07597},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2309.07597},
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+
+class STSB(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="STSB",
+        dataset={
+            "path": "C-MTEB/STSB",
+            "revision": "0cde68302b3541bb8b3c340dc0644b0b745b3dc0",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["validation", "test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+      author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+      year={2024},
+      eprint={2309.07597},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2309.07597},
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 5
+        return metadata_dict
+
+
+class AFQMC(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="AFQMC",
+        dataset={
+            "path": "C-MTEB/AFQMC",
+            "revision": "b44c3b011063adb25877c13823db83bb193913c4",
+        },
+        description="A Chinese dataset for textual relatedness",
+        reference="https://aclanthology.org/2021.emnlp-main.357",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["validation"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation="""@inproceedings{raghu-etal-2021-end,
+    title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
+    author = "Raghu, Dinesh and
+      Agarwal, Shantanu and
+      Joshi, Sachindra and
+      {Mausam}",
+    editor = "Moens, Marie-Francine and
+      Huang, Xuanjing and
+      Specia, Lucia and
+      Yih, Scott Wen-tau",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-main.357",
+    doi = "10.18653/v1/2021.emnlp-main.357",
+    pages = "4348--4366",
+    abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
+}""",
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+
+class QBQTC(AbsTaskSTS):
+    metadata = TaskMetadata(
+        name="QBQTC",
+        dataset={
+            "path": "C-MTEB/QBQTC",
+            "revision": "790b0510dc52b1553e8c49f3d2afb48c0e5c48b7",
+        },
+        description="",
+        reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset",
+        type="STS",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["cmn-Hans"],
+        main_score="cosine_spearman",
+        date=None,
+        domains=None,
+        task_subtypes=None,
+        license=None,
+        annotations_creators=None,
+        dialect=None,
+        sample_creation=None,
+        bibtex_citation=None,
+        descriptive_stats={"n_samples": None, "avg_character_length": None},
+    )
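A quick way to sanity-check one of these task definitions is to run it directly through the mteb library. A minimal sketch, assuming mteb and sentence-transformers are installed; the model name is a placeholder, not something this diff prescribes:

from sentence_transformers import SentenceTransformer
from mteb import MTEB
from evalscope.backend.rag_eval.cmteb.tasks import ATEC

# Placeholder Chinese embedding model; any encoder with an encode() method works.
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
evaluation = MTEB(tasks=[ATEC()])  # ATEC as defined in STS.py above
results = evaluation.run(model, output_folder="outputs/mteb")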

evalscope/backend/rag_eval/cmteb/tasks/__init__.py
@@ -0,0 +1,70 @@
+from .Classification import *
+from .Clustering import *
+from .PairClassification import *
+from .Reranking import *
+from .Retrieval import *
+from .STS import *
+from .CustomTask import *
+
+
+CLS_CLASSIFICATION = {
+    "TNews": TNews,
+    "IFlyTek": IFlyTek,
+    "MultilingualSentiment": MultilingualSentiment,
+    "JDReview": JDReview,
+    "OnlineShopping": OnlineShopping,
+    "Waimai": Waimai,
+}
+
+CLS_CLUSTERING = {
+    "CLSClusteringS2S": CLSClusteringFastS2S,
+    "CLSClusteringP2P": CLSClusteringFastP2P,
+    "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
+    "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+}
+
+CLS_PAIR_CLASSIFICATION = {
+    "Ocnli": Ocnli,
+    "Cmnli": Cmnli,
+}
+
+CLS_RERANKING = {
+    "T2Reranking": T2Reranking,
+    "MMarcoReranking": MMarcoReranking,
+    "CMedQAv1": CMedQAv1,
+    "CMedQAv2": CMedQAv2,
+}
+
+CLS_RETRIEVAL = {
+    "T2Retrieval": T2Retrieval,
+    "MMarcoRetrieval": MMarcoRetrieval,
+    "DuRetrieval": DuRetrieval,
+    "CovidRetrieval": CovidRetrieval,
+    "CmedqaRetrieval": CmedqaRetrieval,
+    "EcomRetrieval": EcomRetrieval,
+    "MedicalRetrieval": MedicalRetrieval,
+    "VideoRetrieval": VideoRetrieval,
+}
+
+CLS_STS = {
+    "ATEC": ATEC,
+    "BQ": BQ,
+    "LCQMC": LCQMC,
+    "PAWSX": PAWSX,
+    "STSB": STSB,
+    "AFQMC": AFQMC,
+    "QBQTC": QBQTC,
+}
+
+CLS_CUSTOM = {
+    "CustomRetrieval": CustomRetrieval,
+}
+
+CLS_DICT = {
+    **CLS_CLASSIFICATION,
+    **CLS_CLUSTERING,
+    **CLS_PAIR_CLASSIFICATION,
+    **CLS_RERANKING,
+    **CLS_RETRIEVAL,
+    **CLS_STS,
+}
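The CLS_* registries above map public task names to classes; note that CLS_CUSTOM is not merged into CLS_DICT. A minimal sketch of how such a registry resolves names to instantiated mteb tasks (resolve_tasks is illustrative, not part of the package):

from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT

def resolve_tasks(task_names):
    # Fail fast on names missing from the registry.
    unknown = [name for name in task_names if name not in CLS_DICT]
    if unknown:
        raise ValueError(f"Unknown CMTEB task(s): {unknown}")
    return [CLS_DICT[name]() for name in task_names]

tasks = resolve_tasks(["ATEC", "T2Reranking"])  # instantiated mteb task objects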

evalscope/backend/rag_eval/ragas/arguments.py
@@ -0,0 +1,47 @@
+from dataclasses import dataclass, field
+from typing import List, Optional, Union, Dict, Any
+
+
+@dataclass
+class TestsetGenerationArguments:
+    docs: List[str] = field(default_factory=list)
+    test_size: int = 10
+    output_file: str = 'outputs/testset.json'
+    knowledge_graph: str = 'outputs/knowledge_graph.json'
+    """
+    For local LLM support, you can use the following fields:
+    model_name_or_path: str
+    model_revision: str = "master"
+    template_type: str = "default"
+    generation_config: Optional[Dict]
+
+    For API LLM support, you can use the following fields:
+    model_name="gpt-4o-mini"
+    api_base: str = "",
+    api_key: Optional[str] = None
+    """
+    generator_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    distribution: str = field(
+        default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
+    )
+    # For LLM based evaluation
+    # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
+    # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
+    # 'french', 'burmese', 'greek', 'italian', 'japanese', 'deutsch', 'kazakh', 'slovak']
+    language: str = 'english'
+
+
+@dataclass
+class EvaluationArguments:
+    testset_file: str
+    critic_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    metrics: List[str] = field(
+        default_factory=lambda: ['answer_relevancy', 'faithfulness']
+    )
+    # For LLM based evaluation
+    # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
+    # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
+    # 'french', 'burmese', 'greek', 'italian', 'japanese', 'deutsch', 'kazakh', 'slovak']
+    language: str = 'english'
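For orientation, a hedged sketch of filling in TestsetGenerationArguments; every value below is a placeholder, and the dict keys for generator_llm and embeddings are assumptions based on the docstring above. Note that `distribution` is annotated `str` in the shipped source even though its default (and evidently its intended shape) is a dict of question-type ratios:

from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments

gen_args = TestsetGenerationArguments(
    docs=["data/knowledge_base/report.md"],        # placeholder document path
    test_size=10,
    generator_llm={"model_name": "gpt-4o-mini"},   # assumed API-LLM keys per docstring
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},  # assumed key
    language="chinese",
)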

evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py
@@ -0,0 +1,91 @@
+import typing as t
+import numpy as np
+from dataclasses import dataclass, field
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
+from pydantic import BaseModel, Field
+from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
+
+
+class FaithfulnessInput(BaseModel):
+    response: str = Field(description="response from AI")
+    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+    def to_string_list(self):
+        return [
+            "inputs:",
+            self.response,
+            "retrieved_contexts: ",
+        ] + self.retrieved_contexts
+
+
+class FaithfulnessOutput(BaseModel):
+    faithful: bool = Field(description="boolean indicating if request was faithful")
+
+
+class MultiModalFaithfulnessPrompt(
+    ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
+):
+    # refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
+    instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
+    input_model = FaithfulnessInput
+    output_model = FaithfulnessOutput
+    examples = [
+        (
+            FaithfulnessInput(
+                response="Apple pie is generally double-crusted.",
+                retrieved_contexts=[
+                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                ],
+            ),
+            FaithfulnessOutput(faithful=True),
+        ),
+        (
+            FaithfulnessInput(
+                response="Apple pies tastes bad.",
+                retrieved_contexts=[
+                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                ],
+            ),
+            FaithfulnessOutput(faithful=False),
+        ),
+    ]
+
+
+@dataclass
+class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
+    name: str = "faithful_rate"  # type: ignore
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "response",
+                "retrieved_contexts",
+            }
+        }
+    )
+    faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
+
+    async def _ascore(self, row: t.Dict, callbacks) -> float:
+        prompt_input = FaithfulnessInput(
+            response=row["response"], retrieved_contexts=row["retrieved_contexts"]
+        )
+        assert self.llm is not None, "LLM is not set"
+        prompt_response = await self.faithfulness_prompt.generate(
+            data=prompt_input, llm=self.llm, callbacks=callbacks
+        )
+        if prompt_response is None:
+            return np.nan
+        return float(prompt_response.faithful)
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks
+    ) -> float:
+        row = sample.to_dict()
+        return await self._ascore(row, callbacks)
+
+
+multimodal_faithness = MultiModalFaithfulness()
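A hedged sketch of scoring one sample with the metric above. The judge model and the use of single_turn_ascore (the public entry point SingleTurnMetric wraps around _single_turn_ascore) are assumptions about the surrounding ragas/langchain setup, not something this diff shows:

import asyncio
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.dataset_schema import SingleTurnSample
from evalscope.backend.rag_eval.ragas.metrics.multi_modal_faithfulness import MultiModalFaithfulness

# Placeholder judge LLM wrapped for ragas.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
metric = MultiModalFaithfulness(llm=evaluator_llm)
sample = SingleTurnSample(
    response="Apple pie is generally double-crusted.",
    retrieved_contexts=["It is generally double-crusted, with pastry above and below the filling."],
)
score = asyncio.run(metric.single_turn_ascore(sample))  # 1.0, 0.0, or nan if the judge fails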

evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py
@@ -0,0 +1,99 @@
+import typing as t
+from dataclasses import dataclass, field
+import numpy as np
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
+from pydantic import BaseModel, Field
+from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
+
+
+class RelevanceInput(BaseModel):
+    user_input: str = Field(description="user input")
+    response: str = Field(description="response from AI")
+    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+    def to_string_list(self):
+        return [
+            f"Question: {self.user_input}",
+            f"Response: {self.response}",
+            "retrieved_contexts: ",
+        ] + self.retrieved_contexts
+
+
+class RelevanceOutput(BaseModel):
+    relevance: bool = Field(description="boolean indicating if request was relevance")
+
+
+class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
+    # refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
+    instruction = """
+Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
+You have two options to answer. Either True / False.
+Answer - True, if the response for the query is in line with context information otherwise False.
+"""
+    input_model = RelevanceInput
+    output_model = RelevanceOutput
+    examples = [
+        (
+            RelevanceInput(
+                user_input="What is the primary ingredient in a traditional Margherita pizza?",
+                response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
+                retrieved_contexts=[
+                    "A traditional Margherita pizza consists of a thin crust.",
+                    "The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
+                    "It is one of the simplest and most classic types of pizza.",
+                ],
+            ),
+            RelevanceOutput(relevance=True),
+        ),
+        (
+            RelevanceInput(
+                user_input="Who won the Best Actor award at the Oscars in 2021?",
+                response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
+                retrieved_contexts=[
+                    "The 93rd Academy Awards were held in 2021.",
+                    "Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
+                    "The event was unique due to COVID-19 restrictions.",
+                ],
+            ),
+            RelevanceOutput(relevance=False),
+        ),
+    ]
+
+
+@dataclass
+class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
+    name: str = "relevance_rate"  # type: ignore
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "user_input",
+                "response",
+                "retrieved_contexts",
+            }
+        }
+    )
+    relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
+
+    async def _ascore(self, row: t.Dict, callbacks) -> float:
+        prompt_input = RelevanceInput(
+            user_input=row["user_input"],
+            response=row["response"],
+            retrieved_contexts=row["retrieved_contexts"],
+        )
+        assert self.llm is not None, "LLM is not set"
+        prompt_response = await self.relevance_prompt.generate(
+            data=prompt_input, llm=self.llm, callbacks=callbacks
+        )
+        if prompt_response is None:
+            return np.nan
+        return float(prompt_response.relevance)
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks
+    ) -> float:
+        row = sample.to_dict()
+        return await self._ascore(row, callbacks)
+
+
+multimodal_relevance = MultiModalRelevance()
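Both module-level metric instances can also plug straight into ragas' batch evaluate. A sketch, assuming a testset with user_input/response/retrieved_contexts columns and the same placeholder judge LLM as above:

from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from evalscope.backend.rag_eval.ragas.metrics.multi_modal_faithfulness import multimodal_faithness
from evalscope.backend.rag_eval.ragas.metrics.multi_modal_relevance import multimodal_relevance

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))  # placeholder
dataset = Dataset.from_json("outputs/testset.json")                  # placeholder path
result = evaluate(dataset, metrics=[multimodal_faithness, multimodal_relevance], llm=evaluator_llm)
print(result.to_pandas())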

evalscope/backend/rag_eval/ragas/task_template.py
@@ -0,0 +1,61 @@
+import os
+import asyncio
+from datasets import Dataset
+from evalscope.backend.rag_eval import EmbeddingModel, LLM
+from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts
+from evalscope.utils.logger import get_logger
+from .arguments import EvaluationArguments
+
+logger = get_logger()
+
+
+def rag_eval(
+    args: EvaluationArguments,
+) -> None:
+
+    from ragas import evaluate, RunConfig
+    from ragas.llms import LangchainLLMWrapper
+    import importlib
+
+    def dynamic_import(*function_names):
+        functions = []
+        for name in function_names:
+            module = importlib.import_module('ragas.metrics')
+            functions.append(getattr(module, name)())
+        return functions
+
+    llm = LLM.load(**args.critic_llm)
+    embedding = EmbeddingModel.load(**args.embeddings)
+
+    # load dataset
+    dataset = Dataset.from_json(args.testset_file)
+
+    # load metrics
+    metrics = dynamic_import(*args.metrics)
+    asyncio.run(
+        translate_prompts(
+            prompts=metrics,
+            target_lang=args.language,
+            llm=LangchainLLMWrapper(llm),
+            adapt_instruction=True,
+        )
+    )
+
+    # evaluate
+    runconfig = RunConfig(timeout=600, max_retries=2, max_wait=60, max_workers=1)
+    score = evaluate(
+        dataset,
+        metrics=metrics,
+        llm=llm,
+        embeddings=embedding,
+        run_config=runconfig,
+    )
+    score_df = score.to_pandas()
+    logger.info(score_df)
+
+    output_path = args.testset_file.replace('.json', '_score.json')
+    score_df.to_json(
+        output_path, indent=4, index=False, orient='records', force_ascii=False
+    )
+
+    logger.info(f'Eval score saved to {output_path}')
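Putting the pieces together, a hedged end-to-end sketch of calling rag_eval with the EvaluationArguments dataclass from arguments.py. Model names and dict keys are placeholders, since the exact kwargs accepted by LLM.load and EmbeddingModel.load are not shown in this diff:

from evalscope.backend.rag_eval.ragas.task_template import rag_eval
from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments

rag_eval(EvaluationArguments(
    testset_file="outputs/testset.json",
    critic_llm={"model_name": "gpt-4o-mini"},                         # assumed key
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},  # assumed key
    metrics=["answer_relevancy", "faithfulness"],
))
# Scores are logged and written next to the input testset, e.g. outputs/testset_score.json.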