evalscope 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/backend_manager.py +2 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +2 -2
- evalscope/backend/rag_eval/__init__.py +3 -0
- evalscope/backend/rag_eval/backend_manager.py +68 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +59 -0
- evalscope/backend/rag_eval/cmteb/base.py +89 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +83 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +153 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +64 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +37 -0
- evalscope/backend/rag_eval/ragas/task_template.py +117 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +1 -2
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/evaluator/evaluator.py +4 -3
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/models/api/__init__.py +3 -0
- evalscope/models/api/openai_api.py +228 -0
- evalscope/perf/http_client.py +5 -5
- evalscope/run.py +4 -0
- evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/METADATA +46 -60
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/RECORD +48 -18
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/WHEEL +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
from mteb import AbsTaskRetrieval
|
|
2
|
+
from mteb.abstasks.TaskMetadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class T2Retrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/T2Retrieval`` dataset.

    The class is purely declarative: it sets one flag and a ``TaskMetadata``
    record, and overrides no methods of ``AbsTaskRetrieval``. Evaluation uses
    the ``dev`` split with ``ndcg_at_10`` as the main score.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and task kind.
        name="T2Retrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
        reference="https://arxiv.org/abs/2304.03679",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/T2Retrieval",
            "revision": "8731a845f1bf500a4f111cf1070785c793d10e64",
            "qrel_revision": "1c83b8d1544e529875e3f6930f3a1fcf749a8e97",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xie2023t2ranking,
title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
year={2023},
eprint={2304.03679},
archivePrefix={arXiv},
primaryClass={cs.IR}
}""",
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 874.1184182791619,
                    "average_query_length": 10.938847974750132,
                    "num_documents": 118605,
                    "num_queries": 22812,
                    "average_relevant_docs_per_query": 5.213571804313519,
                }
            },
        },
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MMarcoRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/MMarcoRetrieval`` dataset.

    Purely declarative: one flag plus a ``TaskMetadata`` record, with no
    methods overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10``
    as the main score.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and task kind.
        name="MMarcoRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="MMarcoRetrieval",
        reference="https://arxiv.org/abs/2309.07597",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/MMarcoRetrieval",
            "revision": "539bbde593d947e2a124ba72651aafc09eb33fc2",
            "qrel_revision": "bae08bb7bddbedb96c7e7db52018a55167b67f89",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xiao2024cpack,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 114.41787048392986,
                    "average_query_length": 10.51131805157593,
                    "num_documents": 106813,
                    "num_queries": 6980,
                    "average_relevant_docs_per_query": 1.0654727793696275,
                }
            },
        },
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class DuRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/DuRetrieval`` dataset.

    Purely declarative: only a ``TaskMetadata`` record is set and no methods
    are overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10`` as
    the main score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="DuRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine",
        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/DuRetrieval",
            "revision": "a1a333e290fe30b10f3f56498e3a0d911a693ced",
            "qrel_revision": "497b7bd1bbb25cb3757ff34d95a8be50a3de2279",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{qiu2022dureaderretrieval,
title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine},
author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang},
year={2022},
eprint={2203.10232},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 331.3219967800322,
                    "average_query_length": 9.289,
                    "num_documents": 100001,
                    "num_queries": 2000,
                    "average_relevant_docs_per_query": 4.9195,
                }
            },
        },
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class CovidRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/CovidRetrieval`` dataset.

    Purely declarative: only a ``TaskMetadata`` record is set and no methods
    are overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10`` as
    the main score. No bibtex citation is recorded.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="CovidRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="COVID-19 news articles",
        reference="https://arxiv.org/abs/2203.03367",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/CovidRetrieval",
            "revision": "1271c7809071a13532e05f25fb53511ffce77117",
            "qrel_revision": "a9f41b7cdf24785531d12417ce0d1157ed4b39ca",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 332.4152658473415,
                    "average_query_length": 25.9304531085353,
                    "num_documents": 100001,
                    "num_queries": 949,
                    "average_relevant_docs_per_query": 1.0105374077976819,
                }
            },
        },
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class CmedqaRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/CmedqaRetrieval`` dataset.

    Purely declarative: only a ``TaskMetadata`` record is set and no methods
    are overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10`` as
    the main score. No bibtex citation is recorded.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="CmedqaRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="Online medical consultation text. Used the CMedQAv2 as its underlying dataset.",
        # NOTE(review): this URL is identical to the one DuRetrieval uses in
        # this file -- confirm it is the intended reference for this dataset.
        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/CmedqaRetrieval",
            "revision": "cd540c506dae1cf9e9a59c3e06f42030d54e7301",
            "qrel_revision": "279d737f36c731c8ff6e2b055f31fe02216fa23d",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 307.7710222897771,
                    "average_query_length": 48.470367591897976,
                    "num_documents": 100001,
                    "num_queries": 3999,
                    "average_relevant_docs_per_query": 1.86271567891973,
                }
            },
        },
    )
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class EcomRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/EcomRetrieval`` dataset.

    Purely declarative: one flag plus a ``TaskMetadata`` record, with no
    methods overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10``
    as the main score. No bibtex citation is recorded.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and task kind.
        name="EcomRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="EcomRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/EcomRetrieval",
            "revision": "687de13dc7294d6fd9be10c6945f9e8fec8166b9",
            "qrel_revision": "39c90699b034ec22ac45b3abf5b0bbb5ffd421f9",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 32.98041664189015,
                    "average_query_length": 6.798,
                    "num_documents": 100902,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class MedicalRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/MedicalRetrieval`` dataset.

    Purely declarative: one flag plus a ``TaskMetadata`` record, with no
    methods overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10``
    as the main score. No bibtex citation is recorded.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and task kind.
        name="MedicalRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="MedicalRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/MedicalRetrieval",
            "revision": "2039188fb5800a9803ba5048df7b76e6fb151fc6",
            "qrel_revision": "37b8efec53c54c3d9c6af212f6710b62ccdf895c",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 122.04231725066585,
                    "average_query_length": 17.938,
                    "num_documents": 100999,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
class VideoRetrieval(AbsTaskRetrieval):
    """Task definition for the ``C-MTEB/VideoRetrieval`` dataset.

    Purely declarative: one flag plus a ``TaskMetadata`` record, with no
    methods overridden. Evaluation uses the ``dev`` split with ``ndcg_at_10``
    as the main score. No bibtex citation is recorded.
    """

    # NOTE(review): presumably makes the evaluator skip hits whose document id
    # equals the query id -- confirm against AbsTaskRetrieval.
    ignore_identical_ids = True

    metadata = TaskMetadata(
        # Identity and task kind.
        name="VideoRetrieval",
        type="Retrieval",
        category="s2p",
        modalities=["text"],
        description="VideoRetrieval",
        reference="https://arxiv.org/abs/2203.03367",
        # Dataset location, pinned to fixed revisions.
        dataset={
            "path": "C-MTEB/VideoRetrieval",
            "revision": "58c2597a5943a2ba48f4668c3b90d796283c5639",
            "qrel_revision": "faa71382b6a29cf1778d1f436b963e75cb5b927c",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["dev"],
        main_score="ndcg_at_10",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # Recorded corpus statistics for the evaluated split.
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": {
                "dev": {
                    "average_document_length": 31.048855642524522,
                    "average_query_length": 7.365,
                    "num_documents": 100930,
                    "num_queries": 1000,
                    "average_relevant_docs_per_query": 1.0,
                }
            },
        },
    )
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
|
|
2
|
+
from mteb.abstasks.TaskMetadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
class ATEC(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/ATEC`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..1. Evaluation uses the
    ``validation`` and ``test`` splits with ``cosine_spearman`` as the main
    score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="ATEC",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/ATEC",
            "revision": "0f319b1142f28d00e055a6770f3f726ae9b7d865",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["validation", "test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # NOTE(review): the entry below is titled "End-to-End Learning of
        # Flowchart Grounded Task-Oriented Dialogs", which does not look like
        # a paper about this dataset -- confirm the intended citation.
        bibtex_citation="""@inproceedings{raghu-etal-2021-end,
title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
author = "Raghu, Dinesh and
Agarwal, Shantanu and
Joshi, Sachindra and
{Mausam}",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.357",
doi = "10.18653/v1/2021.emnlp-main.357",
pages = "4348--4366",
abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 1.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 1
        return meta
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class BQ(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/BQ`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..1. Evaluation uses the
    ``validation`` and ``test`` splits with ``cosine_spearman`` as the main
    score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="BQ",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/BQ",
            "revision": "e3dda5e115e487b39ec7e618c0c6a29137052a55",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["validation", "test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.07597},
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 1.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 1
        return meta
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class LCQMC(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/LCQMC`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..1. Evaluation uses the ``test``
    split with ``cosine_spearman`` as the main score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="LCQMC",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/LCQMC",
            "revision": "17f9b096f80380fce5ed12a9be8be7784b337daf",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.07597},
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 1.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 1
        return meta
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class PAWSX(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/PAWSX`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..1. Evaluation uses the ``test``
    split with ``cosine_spearman`` as the main score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="PAWSX",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/PAWSX",
            "revision": "9c6a90e430ac22b5779fb019a23e820b11a8b5e1",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.07597},
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 1.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 1
        return meta
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class STSB(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/STSB`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..5. Evaluation uses the
    ``validation`` and ``test`` splits with ``cosine_spearman`` as the main
    score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="STSB",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/STSB",
            "revision": "0cde68302b3541bb8b3c340dc0644b0b745b3dc0",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["validation", "test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
year={2024},
eprint={2309.07597},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2309.07597},
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 5.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 5
        return meta
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class AFQMC(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/AFQMC`` dataset.

    Declares a ``TaskMetadata`` record and extends the inherited
    ``metadata_dict`` with the score range 0..1. Evaluation uses the
    ``validation`` split with ``cosine_spearman`` as the main score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="AFQMC",
        description="A Chinese dataset for textual relatedness",
        reference="https://aclanthology.org/2021.emnlp-main.357",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/AFQMC",
            "revision": "b44c3b011063adb25877c13823db83bb193913c4",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["validation"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        # NOTE(review): the entry below is titled "End-to-End Learning of
        # Flowchart Grounded Task-Oriented Dialogs", which does not look like
        # a paper about this dataset -- confirm the intended citation.
        bibtex_citation="""@inproceedings{raghu-etal-2021-end,
title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
author = "Raghu, Dinesh and
Agarwal, Shantanu and
Joshi, Sachindra and
{Mausam}",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.357",
doi = "10.18653/v1/2021.emnlp-main.357",
pages = "4348--4366",
abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
}""",
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the inherited metadata mapping with the score range added.

        Returns:
            The mapping produced by the parent ``metadata_dict`` with
            ``min_score`` set to 0 and ``max_score`` set to 1.
        """
        # The values stored here are ints, so the value type cannot be ``str``.
        meta = super().metadata_dict
        meta["min_score"] = 0
        meta["max_score"] = 1
        return meta
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
class QBQTC(AbsTaskSTS):
    """STS task definition for the ``C-MTEB/QBQTC`` dataset.

    Purely declarative: only a ``TaskMetadata`` record is set and no methods
    or properties are overridden. Evaluation uses the ``test`` split with
    ``cosine_spearman`` as the main score.
    """

    metadata = TaskMetadata(
        # Identity and task kind.
        name="QBQTC",
        # NOTE(review): the description is an empty string -- confirm whether
        # one should be supplied.
        description="",
        reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset",
        type="STS",
        category="s2s",
        modalities=["text"],
        # Dataset location, pinned to a fixed revision.
        dataset={
            "path": "C-MTEB/QBQTC",
            "revision": "790b0510dc52b1553e8c49f3d2afb48c0e5c48b7",
        },
        # Evaluation setup.
        eval_langs=["cmn-Hans"],
        eval_splits=["test"],
        main_score="cosine_spearman",
        # Descriptive fields that are left unspecified for this task.
        annotations_creators=None,
        bibtex_citation=None,
        date=None,
        dialect=None,
        domains=None,
        license=None,
        sample_creation=None,
        task_subtypes=None,
        descriptive_stats={
            "n_samples": None,
            "avg_character_length": None,
        },
    )
|