evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
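The file listing shows `evalscope/models/model_adapter.py` removed and replaced by per-adapter modules re-exported from `evalscope.models`, with benchmarks, metrics, and reports likewise exposed at package level. Based only on the import lines visible in the hunks below, downstream imports appear to migrate roughly as in this sketch:

```python
# Sketch of the import migration suggested by the hunks below (0.8.2 -> 0.10.0).
# Old (0.8.2) style, now removed:
#   from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
# New (0.10.0) style, as used by the updated benchmark adapters:
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy
from evalscope.models import ChatGenerationModelAdapter
```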
evalscope/benchmarks/data_adapter.py:

@@ -2,10 +2,11 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, List, Optional

-from evalscope.
-from evalscope.
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.metrics import Metric
+from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -14,15 +15,22 @@ logger = get_logger()
 class DataAdapter(ABC):

     def __init__(self,
+                 name: str,
                  subset_list: list,
-                 metric_list:
+                 metric_list: List[Metric],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
-                 prompt_template: str =
+                 prompt_template: Optional[str] = None,
                  **kwargs):
         """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
         Args:
+            name: str, the name of the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0
@@ -32,6 +40,7 @@ class DataAdapter(ABC):
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                     the form of A or B or C or D, do not output explanation:`
         """
+        self.name = name
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -39,6 +48,7 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.config_kwargs = kwargs
+        self.category_map = kwargs.get('category_map', {})

     def load(self,
              dataset_name_or_path: str,
@@ -55,33 +65,36 @@ class DataAdapter(ABC):

         """
         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        subset_list = subset_list or self.subset_list

         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(
-
+            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
             if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
+            from modelscope.msdatasets import MsDataset
+
             # Load dataset from remote
-            logger.info(
+            logger.info(
+                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
             data_dict = {}
             split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
             if len(split_list) == 0:
                 logger.error(f'Got empty split list: {split_list}')

-            subset_list = subset_list if subset_list is not None else self.subset_list
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
                 for split in split_list:
-                    dataset =
+                    dataset = MsDataset.load(
                         dataset_name=dataset_name_or_path,
-
+                        subset_name=sub_name,
                         split=split,
+                        cache_dir=work_dir,
                         hub=datasets_hub,
-                        work_dir=work_dir,
                         **kwargs)

                 data_dict[sub_name].update({split: dataset})
@@ -132,30 +145,105 @@ class DataAdapter(ABC):
             prompt_d[AnswerKeys.RAW_INPUT] = sample_d
             res_dict[sub_name].append(prompt_d)

-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict

-
-
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> List[dict]:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Returns:
+            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+
+        res_list = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            metric_func = metric.object
+            res_list.append({
+                'metric_name': metric_name,
+                'score': metric_func(review_res_list),
+                'num': len(review_res_list)
+            })
+        return res_list
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report.
+
+        Here is a format example for gsm8k:
+        {
+            "name": "qwen2.5_gsm8k",
+            "metrics": [
+                {
+                    "name": "AverageAccuracy",
+                    "categories": [
+                        {
+                            "name": "default",
+                            "subsets": [
+                                {
+                                    "name": "main",
+                                    "score": 0.0,
+                                    "num": 2
+                                }
+                            ],
+                            "num": 2,
+                            "score": 0.0,
+                            "macro_score": 0.0
+                        }
+                    ],
+                    "num": 2,
+                    "score": 0.0,
+                    "macro_score": 0.0
+                }
+            ],
+            "dataset_name": "gsm8k",
+            "model_name": "qwen2.5"
+        }
+        """  # noqa: E501
+        kwargs['category_map'] = self.category_map
+        kwargs['metric_list'] = self.metric_list
+        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
         The input format is compatible with OpenAI Chat Completions APIs.
-        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api

         Args:
             input_d (Any): The raw input. Depending on the dataset.
+            subset_name (str): The subset name.
+            few_shot_list (list): The few-shot examples.

         Returns:
+            For class ChatGenerationModelAdapter, the output format is:
+                {'data': [full_prompt], 'system_prompt': (str, optional)}, -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt]}
-
+                {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
-        """
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
+        """  # noqa: E501
         raise NotImplementedError

     @abstractmethod
@@ -172,7 +260,7 @@ class DataAdapter(ABC):
         raise NotImplementedError

     @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.

@@ -193,71 +281,11 @@ class DataAdapter(ABC):

         Args:
             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A'
+                e.g. 'A', extracted from get_gold_answer method.
             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B'
+                e.g. 'B', extracted from parse_pred_result method.

         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
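With get_fewshot_examples, compute_metric, and gen_report now provided by the base class, the docstring above leaves only four hooks to a concrete adapter. A minimal sketch of such a subclass, assuming a hypothetical dataset whose rows carry 'query' and 'answer' fields (the class name, field names, and matching rule are illustrative, not taken from this diff):

```python
from typing import Any

from evalscope.benchmarks import DataAdapter
from evalscope.constants import EvalType


class MyQAAdapter(DataAdapter):
    """Illustrative subclass; only the four required hooks are shown."""

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # Shape follows the ChatGenerationModelAdapter format documented above.
        return {'data': [input_d['query']], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        # 'answer' is an assumed field name for this sketch.
        return input_d['answer']

    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        return str(result).strip()

    def match(self, gold: str, pred: str) -> float:
        # Per-sample score; the base-class compute_metric aggregates these
        # with whatever Metric objects the benchmark registers.
        return float(gold.strip() == pred)
```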
evalscope/benchmarks/general_qa/__init__.py:

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
evalscope/benchmarks/general_qa/general_qa_adapter.py:

@@ -1,39 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
-import json
 import os.path
 from collections import defaultdict
-from typing import
+from typing import List

-from evalscope.benchmarks
-from evalscope.metrics
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()

-DATASET_ID = 'general_qa'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='general_qa',
+    dataset_id='general_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageBLEU],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+    def __init__(self, **kwargs):

-        super().__init__(
-            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+        super().__init__(**kwargs)

     def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

@@ -71,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):

         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt]}
+        return {'data': [prompt], 'system_prompt': self.prompt_template}

     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -95,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
         """
         return result

-    def match(self, gold: str, pred: str) ->
+    def match(self, gold: str, pred: str) -> dict:
         """
         Args:
             gold: str
             pred: str

         Returns:
-            bleu_score:
+            bleu_score: dict

         """
         res = dict()
@@ -110,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
         bleu_dict = bleu_ngram_one_sample(pred, gold)
         res.update(rouge_dict)
         res.update(bleu_dict)
-        # return bleu(item)
         return res

-    def compute_metric(self, review_res_list:
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples

@@ -121,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
         review_res_list: [score1, score2, ...]

         Returns:
-            avg_res:
+            avg_res: List[dict]

         """
         items = defaultdict(list)
         for scores in review_res_list:
             for k, v in scores.items():
-                items[k].append(
+                items[k].append(v)
         # items = [(score, 1.0) for score in review_res_list]
-
-        # return weighted_mean(items)
-        return res
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Args:
-            subset_score_map: {subset_name: (score_dict, num), ...}
-            report_name: str, the user-defined report name.
-
-        Returns:
-        {
-            "name":"GeneralQA",
-            "metric":"WeightedAverageBLEU",
-            "score":0.399,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.399,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.399
-                        },
-                    ]
-                }
-            ],
-            "total_num":10
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': score_dict
-        } for subset_name, (score_dict, _) in subset_score_map.items()]
-        total_avg_list = defaultdict(float)
-        for score_dict, num in subset_score_map.values():
-            for metric, score in score_dict.items():
-                total_avg_list[metric] += score * num / total_num
-
-        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'general_qa',
-            metric=self.metric_list[0]['name'],
-            score=total_avg_list,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
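The rewritten compute_metric above drops the hand-written gen_report and instead averages each score key across samples, returning the [{'metric_name', 'score', 'num'}] shape the new base-class gen_report expects. A small worked illustration of that aggregation (the metric key names are placeholders, not taken from the diff):

```python
from collections import defaultdict

from evalscope.metrics import mean  # same helper the adapter imports above

# Two per-sample score dicts, as produced by GeneralQAAdapter.match;
# the key names here are assumed for illustration.
review_res_list = [
    {'rouge-l-f': 0.50, 'bleu-4': 0.20},
    {'rouge-l-f': 0.70, 'bleu-4': 0.40},
]

items = defaultdict(list)
for scores in review_res_list:
    for k, v in scores.items():
        items[k].append(v)

result = [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
# -> [{'metric_name': 'rouge-l-f', 'score': 0.6, 'num': 2},
#     {'metric_name': 'bleu-4', 'score': 0.3, 'num': 2}]
```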
evalscope/benchmarks/gsm8k/__init__.py:

@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/gsm8k/gsm8k_adapter.py:

@@ -1,70 +1,51 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
+# flake8: noqa
 import math
 import os
 import re

-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

-# flake8: noqa
-
 logger = get_logger()

-DATASET_ID = 'modelscope/gsm8k'
-SUBSET_LIST = ['main']
-ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)')
-INVALID_ANS = '[invalid]'
-

+@Benchmark.register(
+    name='gsm8k',
+    dataset_id='modelscope/gsm8k',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['main'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=4,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class GSM8KAdapter(DataAdapter):

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
+    def __init__(self, **kwargs):
         """
         Data adapter for GSM8K dataset.

         Args:
             subset_list (list): Subset list for the dataset. Default: ['main']
-            metric_list (list): Metric list for the dataset. Default: [{'name': '
+            metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
             few_shot_num (int): Number of few-shot examples. Default: 4
             train_split (str): Train split name. Default: 'train'
             eval_split (str): The target eval split name. Default: 'test'
             **kwargs: ...
         """
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 4-shot examples by system for GSM8K.')
-            few_shot_num = 4
-
+        few_shot_num = kwargs.get('few_shot_num', 4)
         if few_shot_num != 4 and few_shot_num != 0:
             logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                          f'Use 4-shot by default.')
-            few_shot_num = 4
+            kwargs['few_shot_num'] = 4

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -94,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0

         full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
-        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt

-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
@@ -142,66 +122,6 @@ class GSM8KAdapter(DataAdapter):

         return number_equal(gold_ans=gold, pred_ans=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name. Default: None
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"GSM8K",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.5632,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.5632,
-                    "subset":[
-                        {
-                            "name":"main",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'gsm8k',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
         if use_fewshot:
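Both adapters in this diff now declare their defaults through @Benchmark.register and reduce __init__ to super().__init__(**kwargs), which suggests the registry supplies name, dataset_id, subset_list, metric_list, and split settings as keyword arguments. A hedged sketch of registering a new benchmark in the same style; every name and dataset id below is illustrative:

```python
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_benchmark',                  # hypothetical benchmark name
    dataset_id='my-org/my_benchmark',     # hypothetical ModelScope dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyBenchmarkAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Registered defaults arrive via **kwargs, mirroring GSM8KAdapter above.
        super().__init__(**kwargs)

    # The gen_prompt / get_gold_answer / parse_pred_result / match hooks from
    # the DataAdapter sketch earlier still need to be implemented here.
```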
evalscope/benchmarks/hellaswag/__init__.py:

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa