evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -1
- evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
- evalscope/benchmarks/benchmark.py +12 -10
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
- evalscope/benchmarks/data_adapter.py +82 -19
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
- evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
- evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +71 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
- evalscope/benchmarks/utils.py +43 -0
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +16 -1
- evalscope/config.py +13 -3
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +32 -6
- evalscope/models/chat_adapter.py +4 -1
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/local_model.py +3 -2
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +107 -29
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +18 -8
- evalscope/perf/http_client.py +8 -6
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +15 -8
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +429 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +151 -32
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/data_adapter.py

```diff
@@ -2,8 +2,10 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Any, List, Optional, Union
 
+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -17,6 +19,7 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
@@ -24,6 +27,8 @@ class DataAdapter(ABC):
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
                  system_prompt: Optional[str] = None,
+                 query_template: Optional[str] = None,
+                 pretty_name: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -45,6 +50,7 @@ class DataAdapter(ABC):
         """
         self.name = name
         self.dataset_id = dataset_id
+        self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -52,14 +58,24 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.system_prompt = system_prompt
+        self.query_template = query_template
+        self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
+        self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)
 
     def load(self,
              dataset_name_or_path: str = None,
              subset_list: list = None,
             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
              **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -74,22 +90,43 @@ class DataAdapter(ABC):
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from
-
-
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=False, **kwargs)
         else:
-            from
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=True, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
+        return data_dict
+
+    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
+        from modelscope.msdatasets import MsDataset
 
-
-
-
-
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        if len(split_list) == 0:
-            logger.error(f'Got empty split list: {split_list}')
+        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
+        split_as_subset: bool = kwargs.pop('split_as_subset', False)
+        # Load dataset from remote
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
+        data_dict = {}
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        if len(split_list) == 0:
+            logger.error(f'Got empty split list: {split_list}')
+
+        if split_as_subset:
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = MsDataset.load(
+                        dataset_name=dataset_name_or_path,
+                        split=sub_name,  # load subset from split
+                        cache_dir=work_dir,
+                        hub=datasets_hub,
+                        **kwargs)
+                    data_dict[sub_name].update({split: dataset})
+        else:
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
@@ -101,17 +138,34 @@ class DataAdapter(ABC):
                         cache_dir=work_dir,
                         hub=datasets_hub,
                         **kwargs)
-
                     data_dict[sub_name].update({split: dataset})
 
         return data_dict
 
-    def load_from_disk(self,
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
         Load the dataset from local disk.
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
+        """
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+
+    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
-
+        Reformat the dataset subset with subset_key and format.
+        """
+        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
+
+        for sub_name, sub_data_dict in data_dict.items():
+            for split in [self.train_split, self.eval_split]:
+                if split is None:
+                    continue
+                for sample_d in sub_data_dict[split]:
+                    new_subset_name = format.format(sample_d[subset_key])
+                    if new_subset_name not in self.subset_list:
+                        continue
+                    res_dict[new_subset_name][split].append(sample_d)
+        return res_dict
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -138,7 +192,7 @@ class DataAdapter(ABC):
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
+            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                 few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                           self.few_shot_num,
@@ -161,7 +215,7 @@ class DataAdapter(ABC):
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
@@ -232,6 +286,12 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
@@ -295,3 +355,6 @@ class DataAdapter(ABC):
         The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
+
+    def llm_match(self, *args, **kwargs):
+        pass
```
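Taken together, the `data_adapter.py` changes move model-adapter selection onto the benchmark registration (a string constant instead of an adapter class), wrap every `parse_pred_result` with `preprocess_decorator` via `__init_subclass__`, and replace ad-hoc `{'data': [...], 'system_prompt': ...}` dicts with a shared `gen_prompt_data()` helper backed by `PromptData`. For orientation only, the sketch below shows what a third-party adapter might look like against this 0.12.x surface; the benchmark name, dataset id, and record fields (`question`, `answer`) are invented, and the exact `parse_pred_result` signature is assumed from the adapters shown later in this diff.

```python
# A hypothetical adapter written against the 0.12.x DataAdapter surface shown above.
# Everything named demo_* and the record fields 'question'/'answer' are invented.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import OutputType
from evalscope.metrics import exact_match


@Benchmark.register(
    name='demo_qa',                       # invented benchmark name
    pretty_name='Demo QA',                # new 0.12 field: display name
    dataset_id='demo-org/demo_qa',        # invented dataset id
    model_adapter=OutputType.GENERATION,  # string constant instead of an adapter class
    subset_list=['default'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='Question: {query}\nAnswer:',
)
class DemoQAAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        prompt = self.prompt_template.format(query=input_d['question'])
        # gen_prompt_data() wraps the prompt into the unified PromptData dict,
        # carrying self.choices and self.system_prompt along with it.
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
        # __init_subclass__ wraps this method with preprocess_decorator automatically.
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)
```

The `output_types` list seen in the registrations below appears to declare which string-keyed adapters a benchmark can run under, with `model_adapter` acting as the default.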
evalscope/benchmarks/general_mcq/general_mcq_adapter.py

```diff
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -16,22 +15,24 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
-)
+    query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(query=context)
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,27 +104,19 @@ class GeneralMCQAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-
-
-        example = '问题:' + input_d['question']
-        for choice in cls.choices:
-            if choice in input_d:
-                example += f'\n{choice}. {input_d[f"{choice}"]}'
+    def _format_example(self, input_d: dict, include_answer=True):
+        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
 
         if include_answer:
-
+            return self.query_template.format(
+                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
         else:
-
-        return example
+            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
```
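The new `query_template` drives both few-shot examples (answer filled in) and the target question (answer left blank, trailing whitespace stripped) in `_format_example`. A standalone snippet showing the strings it produces, with an invented record:

```python
# Standalone illustration of the query_template used by GeneralMCQAdapter above.
# The sample record is invented.
query_template = '问题:{question}\n{choices}\n答案: {answer}\n\n'
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

input_d = {'question': '下列哪个数是质数?', 'A': '4', 'B': '7', 'C': '9', 'answer': 'B'}
choices_str = '\n'.join(f'{c}. {input_d[c]}' for c in choices if c in input_d)

# Few-shot example: the answer is filled in.
print(query_template.format(question=input_d['question'], choices=choices_str, answer=input_d['answer']))
# 问题:下列哪个数是质数?
# A. 4
# B. 7
# C. 9
# 答案: B

# Target question: the answer slot is left empty and trailing whitespace stripped,
# so the string ends with '答案:' and the model is prompted to complete it.
print(query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip())
```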
evalscope/benchmarks/general_qa/general_qa_adapter.py

```diff
@@ -16,12 +16,12 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
+    prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num
@@ -30,18 +30,31 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
 
-
+        data_file_dict = defaultdict(str)
         data_list = []
 
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
@@ -62,11 +75,9 @@ class GeneralQAAdapter(DataAdapter):
         logger.warning('The history is not included in the prompt for GeneralQA. \
                     To be supported in the future.')
 
-
-
-
-        # prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt], 'system_prompt': self.system_prompt}
+        query = input_d.get('question', '') or input_d.get('query', '')
+        prompt = self.prompt_template.format(query=query)
+        return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -101,13 +112,15 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         res = dict()
-
-
-
-
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
         return res
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
```
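With the rewritten `load()`, `general_qa` accepts either a directory containing one `<subset>.jsonl` file per subset or a single `.jsonl` file whose basename becomes the subset name, and each record's question is read from `question` or `query`. A hedged sketch of preparing such a layout; the paths are examples, and the gold-answer field name is not visible in this hunk, so `answer` below is an assumption:

```python
# Example layout for the new general_qa loading logic; paths and fields are illustrative.
import json
import os

data_dir = 'custom_qa'  # directory form: one <subset>.jsonl file per subset in subset_list
os.makedirs(data_dir, exist_ok=True)

samples = [
    {'question': '中国的首都是哪里?', 'answer': '北京'},  # 'answer' as the gold field is an assumption
    {'query': 'What is 2 + 2?', 'answer': '4'},          # 'query' is also accepted as the question key
]
with open(os.path.join(data_dir, 'default.jsonl'), 'w', encoding='utf-8') as f:
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + '\n')

# Single-file form: pointing the adapter at custom_qa/default.jsonl directly makes
# 'default' (the file basename without extension) the subset name.
```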
evalscope/benchmarks/gpqa/gpqa_adapter.py

```diff
@@ -3,19 +3,20 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
-    train_split=
+    train_split=None,
     eval_split='train',  # only have train split
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
-            self.prompt_prefix += open(
-
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'
 
@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
         prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(prompt)
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
```
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

```diff
@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -15,12 +14,12 @@ logger = get_logger()
 
 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
-    train_split=
+    train_split=None,
     eval_split='test',
     prompt_template="Question: {query}\nLet's think step by step\nAnswer:",
 )
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):
 
         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
```
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

```diff
@@ -4,9 +4,8 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,8 +17,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=
+    model_adapter=OutputType.CONTINUOUS,
+    output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -30,8 +31,6 @@ logger = get_logger()
 )
 class HellaSwagAdapter(DataAdapter):
 
-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +39,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0
 
         super().__init__(**kwargs)
+        self.choices = ['0', '1', '2', '3']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -89,7 +89,7 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return
+        return self.gen_prompt_data(ctx_continuation_pair_list)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -107,7 +107,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.CONTINUOUS:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -115,12 +115,8 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
         else:
-
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
```
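Under `OutputType.CONTINUOUS`, the adapter still receives one summed log-likelihood per ending and keeps the length-normalized `argmax` selection shown above. A tiny numeric illustration of that unchanged scoring rule, with made-up numbers:

```python
# Length-normalized continuation scoring, mirroring HellaSwagAdapter.parse_pred_result above.
# The log-likelihoods and completion lengths are made up for illustration.
import numpy as np

result = np.array([-12.4, -9.8, -15.1, -10.2])        # summed log-likelihood per ending
completion_len = np.array([6.0, 5.0, 8.0, 4.0])       # length of each continuation

best_choice_idx = np.argmax(result / completion_len)  # highest per-unit-length score wins
print(str(best_choice_idx))                           # -> '2' for these numbers
```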
evalscope/benchmarks/humaneval/humaneval_adapter.py

```diff
@@ -13,8 +13,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
@@ -66,7 +66,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
```
evalscope/benchmarks/ifeval/ifeval_adapter.py

```diff
@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return
+        return self.gen_prompt_data(input_d['prompt'])
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
@@ -47,7 +46,7 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
```
evalscope/benchmarks/iquiz/iquiz_adapter.py

```diff
@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return
+        return self.gen_prompt_data(prompt)
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
        """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
```
evalscope/benchmarks/math_500/math_500_adapter.py

```diff
@@ -1,6 +1,5 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -10,9 +9,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='math_500',
+    pretty_name='MATH-500',
     dataset_id='AI-ModelScope/MATH-500',
-
-    subset_list=['default'],
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
@@ -24,6 +23,12 @@ class Math500Adapter(DataAdapter):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
@@ -31,7 +36,7 @@ class Math500Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
```
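The new `Math500Adapter.load()` leans on the `reformat_subset()` helper added to `DataAdapter` above: the dataset is still fetched as the single `default` subset, then regrouped by each sample's `level` field into the `Level 1` … `Level 5` subsets declared in `subset_list`. An abbreviated, made-up illustration of that regrouping:

```python
# Illustration of reformat_subset(data_dict, subset_key='level', format='Level {}')
# with abbreviated, made-up MATH-500 records.
data_dict = {
    'default': {
        'test': [
            {'problem': 'Compute 1 + 1.', 'level': 1},
            {'problem': 'Solve x^2 = 4.', 'level': 2},
            {'problem': 'Another level-1 problem.', 'level': 1},
        ]
    }
}

# reformat_subset() builds 'Level {level}' keys from each sample and keeps only the
# subsets declared in subset_list, so the result is grouped roughly like this:
# {
#     'Level 1': {'test': [<the two level-1 records>]},
#     'Level 2': {'test': [<the level-2 record>]},
# }
```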