evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/data_adapter.py
@@ -2,10 +2,11 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from
+from collections import defaultdict
+from typing import Any, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics import
+from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
 
@@ -16,12 +17,16 @@ class DataAdapter(ABC):
 
     def __init__(self,
                  name: str,
+                 dataset_id: str,
                  subset_list: list,
-                 metric_list: List[
+                 metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
+                 system_prompt: Optional[str] = None,
+                 query_template: Optional[str] = None,
+                 pretty_name: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -31,6 +36,7 @@ class DataAdapter(ABC):
             - match
         Args:
             name: str, the name of the benchmark.
+            dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
            subset_list: list of subset names for the dataset.
            metric_list: list, the metric list to evaluate the model on specific benchmark.
            few_shot_num: int, number of few-shot examples. Default: 0
@@ -41,20 +47,23 @@
                the form of A or B or C or D, do not output explanation:`
         """
         self.name = name
+        self.dataset_id = dataset_id
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
         self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
+        self.query_template = query_template
+        self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
 
     def load(self,
-             dataset_name_or_path: str,
+             dataset_name_or_path: str = None,
              subset_list: list = None,
              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
              **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -64,27 +73,45 @@
            train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.
 
         """
-        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
         subset_list = subset_list or self.subset_list
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-
+            data_dict = self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+        if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
+            raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        return data_dict
+
+    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
+        from modelscope.msdatasets import MsDataset
+
+        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
+        split_as_subset: bool = kwargs.pop('split_as_subset', False)
+        # Load dataset from remote
+        logger.info(
+            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
-
-
-
-
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        if len(split_list) == 0:
-            logger.error(f'Got empty split list: {split_list}')
+        data_dict = {}
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        if len(split_list) == 0:
+            logger.error(f'Got empty split list: {split_list}')
 
+        if split_as_subset:
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = MsDataset.load(
+                        dataset_name=dataset_name_or_path,
+                        split=sub_name,  # load subset from split
+                        cache_dir=work_dir,
+                        hub=datasets_hub,
+                        **kwargs)
+                    data_dict[sub_name].update({split: dataset})
+        else:
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
@@ -96,17 +123,48 @@
                         cache_dir=work_dir,
                         hub=datasets_hub,
                         **kwargs)
-
                     data_dict[sub_name].update({split: dataset})
 
         return data_dict
 
-    def load_from_disk(self,
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.
        If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
+        """
+        from modelscope.msdatasets import MsDataset
+
+        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+            subsets: {subset_list}')
+        data_dict = {}
+        subset_list = subset_list or self.subset_list
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        for sub_name in subset_list:
+            data_dict[sub_name] = {}
+            # e.g. train: few-shot, test: target dataset to evaluate
+            for split in split_list:
+                dataset = MsDataset.load(
+                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
+                data_dict[sub_name].update({split: dataset})
+        return data_dict
+
+    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
+        """
+        Reformat the dataset subset with subset_key and format.
        """
-
+        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
+
+        for sub_name, sub_data_dict in data_dict.items():
+            for split in [self.train_split, self.eval_split]:
+                if split is None:
+                    continue
+                for sample_d in sub_data_dict[split]:
+                    new_subset_name = format.format(sample_d[subset_key])
+                    if new_subset_name not in self.subset_list:
+                        continue
+                    res_dict[new_subset_name][split].append(sample_d)
+        return res_dict
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -133,7 +191,7 @@ class DataAdapter(ABC):
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
+            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                 few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                           self.few_shot_num,
@@ -156,7 +214,7 @@ class DataAdapter(ABC):
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: list) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
@@ -170,14 +228,15 @@
             raise ValueError('No metric list found for the benchmark.')
 
         res_list = []
-        for
+        for metric_str in self.metric_list:
+            metric = metric_registry.get(metric_str)
             metric_name = metric.name
             metric_func = metric.object
-
-
-
-
-            })
+            if isinstance(review_res_list, dict):
+                review_res = review_res_list.get(metric_name, [])
+            else:
+                review_res = review_res_list
+            res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
 
     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
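The metric change above replaces imported metric objects with registry lookups by name. A minimal, self-contained sketch of the new compute_metric dispatch (mock registry for illustration only; the real entries live in evalscope.metrics.named_metrics, and review results may arrive either as one flat list or as a dict keyed by metric name):

from dataclasses import dataclass
from typing import Callable, Dict, List, Union

@dataclass
class Metric:  # stand-in for an entry in metric_registry
    name: str
    object: Callable[[list], float]

metric_registry: Dict[str, Metric] = {
    'AverageAccuracy': Metric('AverageAccuracy', lambda xs: sum(xs) / len(xs) if xs else 0.0),
}

def compute_metric(metric_list: List[str], review_res_list: Union[dict, list]) -> List[dict]:
    res_list = []
    for metric_str in metric_list:
        metric = metric_registry[metric_str]
        # a dict maps each metric name to its own list of per-sample results
        review_res = review_res_list.get(metric.name, []) if isinstance(review_res_list, dict) else review_res_list
        res_list.append({'metric_name': metric.name, 'score': metric.object(review_res), 'num': len(review_res)})
    return res_list

print(compute_metric(['AverageAccuracy'], [1, 0, 1, 1]))  # -> [{'metric_name': 'AverageAccuracy', 'score': 0.75, 'num': 4}]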
evalscope/benchmarks/data_collection/__init__.py (File without changes)
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -0,0 +1,71 @@
+import math
+import os
+import re
+from typing import Any, Optional
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='data_collection',
+    dataset_id='',  # dataset_id need to be set
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
+class DataCollectionAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        """
+        Data adapter for collection dataset.
+        """
+        super().__init__(**kwargs)
+
+    def load(self,
+             dataset_name_or_path: str = None,
+             subset_list: list = None,
+             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+             datasets_hub: str = HubType.MODELSCOPE,
+             **kwargs) -> dict:
+        """
+        Load the dataset. Remote and local datasets are supported.
+        """
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
+        subset_list = subset_list or self.subset_list
+
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset = jsonl_to_list(dataset_name_or_path)
+            if len(dataset) == 0:
+                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        else:
+            from modelscope.msdatasets import MsDataset
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
+
+            dataset = MsDataset.load(dataset_name=dataset_name_or_path, cache_dir=work_dir, hub=datasets_hub, **kwargs)
+
+            dataset = dataset[self.eval_split].to_list()
+
+        return dataset
+
+    def get_gold_answer(self, input_d: Any) -> Any:
+        return super().get_gold_answer(input_d)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        return super().match(gold, pred)
+
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        return super().parse_pred_result(result, raw_input_d, eval_type)
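For the local branch above, jsonl_to_list reads a JSON Lines file, i.e. one JSON object per line. A rough sketch of producing such a file by hand (the field names here are hypothetical and not the schema that evalscope.collections actually emits):

import json, os, tempfile

rows = [{'prompt': 'What is 2 + 2?', 'answer': '4'},
        {'prompt': 'Name the capital of France.', 'answer': 'Paris'}]
path = os.path.join(tempfile.mkdtemp(), 'my_collection.jsonl')
with open(path, 'w', encoding='utf-8') as f:
    for row in rows:
        f.write(json.dumps(row, ensure_ascii=False) + '\n')
# Passing such a path as dataset_id makes load() take the local branch and
# return the parsed list directly instead of querying the ModelScope hub.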
evalscope/benchmarks/general_mcq/__init__.py (File without changes)
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -0,0 +1,125 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import csv
+import os
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics.metrics import exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='general_mcq',
+    dataset_id='general_mcq',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['default'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split='dev',
+    eval_split='val',
+    prompt_template='请回答问题,并选出其中的正确答案\n{query}',
+    query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
+class GeneralMCQAdapter(DataAdapter):
+
+    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            for split_name in [self.train_split, self.eval_split]:
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
+                if os.path.exists(file_path):
+                    with open(file_path, encoding='utf-8') as f:
+                        rows = []
+                        reader = csv.reader(f)
+                        header = next(reader)
+                        for row in reader:
+                            item = dict(zip(header, row))
+                            rows.append(item)
+
+                        if subset_name in data_dict:
+                            data_dict[subset_name].update({split_name: rows})
+                        else:
+                            data_dict[subset_name] = {split_name: rows}
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the C-Eval:
+
+            {'id': 0,
+            'question': '下列关于税法基本原则的表述中,不正确的是____。',
+            'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
+            'B': '税收公平原则源于法律上的平等性原则',
+            'C': '税收效率原则包含经济效率和行政效率两个方面',
+            'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
+            'answer': 'D'}
+
+        Returns:
+            {'data': ['prompt ...']}
+        """
+
+        few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+        if len(few_shot_prompts) > 0:
+            context: str = '\n'.join(few_shot_prompts) + '\n'
+        else:
+            context = ''
+        context = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+
+        full_prompt = self.prompt_template.format(query=context)
+
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('answer', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d (dict): The raw input. Depending on the dataset.
+            eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if eval_type == EvalType.CHECKPOINT:
+            return result
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        else:
+            raise ValueError(f'Invalid eval_type: {eval_type}')
+
+    def match(self, gold: str, pred: str) -> float:
+        return exact_match(gold=gold, pred=pred)
+
+    def _format_example(self, input_d: dict, include_answer=True):
+        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
+
+        if include_answer:
+            return self.query_template.format(
+                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
+        else:
+            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
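Based on load_from_disk and the registered defaults above (subset 'default', train_split 'dev', eval_split 'val'), a local general_mcq dataset directory would look roughly like this; the example row is purely illustrative, only the column names are fixed by the adapter:

general_mcq/
  default_dev.csv   # optional few-shot examples
  default_val.csv   # samples to evaluate

default_val.csv:
question,A,B,C,D,answer
世界上最高的山峰是哪一座?,珠穆朗玛峰,乔戈里峰,干城章嘉峰,洛子峰,A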
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -5,7 +5,7 @@ from collections import defaultdict
 from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -18,10 +18,11 @@ logger = get_logger()
     dataset_id='general_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageBLEU],
+    metric_list=['AverageBLEU'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
+    prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num
@@ -30,16 +31,16 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self,
+    def load(self, **kwargs) -> dict:
 
-        data_file_list = glob.glob(os.path.join(
+        data_file_list = glob.glob(os.path.join(self.dataset_id, '*.jsonl'))
         data_list = []
 
         try:
             for file_path in data_file_list:
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
-            raise ValueError(f'Failed to load data from {
+            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
         data_dict = {'default': {'test': data_list}}
 
@@ -62,11 +63,9 @@ class GeneralQAAdapter(DataAdapter):
         logger.warning('The history is not included in the prompt for GeneralQA. \
             To be supported in the future.')
 
-
-
-
-        # prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt], 'system_prompt': self.prompt_template}
+        query = input_d.get('question', '') or input_d.get('query', '')
+        prompt = self.prompt_template.format(query=query)
+        return {'data': [prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -107,7 +106,7 @@ class GeneralQAAdapter(DataAdapter):
         res.update(bleu_dict)
         return res
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
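A small illustration of the GeneralQA prompt change above: the user prompt is now built from prompt_template, and the system prompt comes from system_prompt instead of reusing the template (the sample input is illustrative):

prompt_template = '请回答问题\n{query}'
input_d = {'question': '中国的首都是哪座城市?'}  # a 'query' field is accepted as a fallback
query = input_d.get('question', '') or input_d.get('query', '')
print(prompt_template.format(query=query))
# 请回答问题
# 中国的首都是哪座城市?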
evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -3,10 +3,9 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import
-from evalscope.metrics import
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
-from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
@@ -14,11 +13,11 @@ from evalscope.utils.utils import ResponseParser
     dataset_id='modelscope/gpqa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
-    metric_list=[
+    metric_list=['AveragePass@1'],
     few_shot_num=5,
-    train_split=
+    train_split=None,
     eval_split='train',  # only have train split
-    prompt_template='',
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class GPQAAdapter(DataAdapter):
 
@@ -48,9 +47,10 @@ class GPQAAdapter(DataAdapter):
         """ # noqa: E501
         processed_input_d = self.__process_input(input_d)
         input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
-
+        query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
-
+        prompt = self.prompt_template.format(query=query)
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,10 +94,28 @@
         """
         Parse the predicted result and extract proper answer.
         """
-        return
+        return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
         Match the gold answer and the predicted answer.
         """
         return exact_match(gold=gold, pred=pred)
+
+    @staticmethod
+    def get_multiple_choice_answer(pred: str):
+        tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
+        if tmp:
+            pred = tmp
+        else:
+            pred = [pred.strip().strip('.')]
+
+        if len(pred) == 0:
+            pred = ''
+        else:
+            pred = pred[-1]
+
+        # Remove the period at the end, again!
+        pred = pred.rstrip('.').rstrip('/')
+
+        return pred
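A self-contained illustration of the answer extraction added above, which pairs with the new \boxed{} prompt template: the last standalone A-D token in the upper-cased model output wins.

import re

def get_multiple_choice_answer(pred: str) -> str:
    # mirrors GPQAAdapter.get_multiple_choice_answer from the diff above
    tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
    candidates = tmp if tmp else [pred.strip().strip('.')]
    answer = candidates[-1] if candidates else ''
    return answer.rstrip('.').rstrip('/')

print(get_multiple_choice_answer('Both A and B are plausible, but the answer is \\boxed{C}.'))  # -> 'C'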
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -19,11 +18,11 @@ logger = get_logger()
     dataset_id='modelscope/gsm8k',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=4,
-    train_split=
+    train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template="Question: {query}\nLet's think step by step\nAnswer:",
 )
 class GSM8KAdapter(DataAdapter):
 
@@ -73,10 +72,11 @@ class GSM8KAdapter(DataAdapter):
         }
         """
         use_fewshot = self.few_shot_num > 0
+        context = self._generate_prompt(use_fewshot=use_fewshot)
 
-        full_prompt = self.
+        full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
@@ -123,7 +123,7 @@
         return number_equal(gold_ans=gold, pred_ans=pred)
 
     @classmethod
-    def _generate_prompt(cls,
+    def _generate_prompt(cls, use_fewshot: bool = True) -> str:
         if use_fewshot:
             # Use 4-shot examples by system
             context = (
@@ -135,14 +135,9 @@
                 "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
                 "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
                 'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n'
-
-                # context = input_d['question']
-                # fewshot_prompts = ['Question: ' + item_d['question'] + '\nAnswer: ' + item_d['answer'] for item_d in few_shot_list]
-                # fewshot_prompts = fewshot_prompts + ['Question: ' + context + '\nAnswer:']
-                # context = '\n\n'.join(fewshot_prompts)
+            )
         else:
-            context =
-            context = 'Question: ' + context + '\nAnswer:'
+            context = ''
         return context
 
     @staticmethod
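An illustration of the reworked GSM8K prompt assembly above: the few-shot block returned by _generate_prompt is simply prepended to the templated question (the question text below is made up):

prompt_template = "Question: {query}\nLet's think step by step\nAnswer:"
context = ''  # _generate_prompt() returns the built-in 4-shot block when few_shot_num > 0
question = 'A farm has 3 pens with 12 chickens each. How many chickens are there in total?'
full_prompt = context + prompt_template.format(query=question)
print(full_prompt)
# Question: A farm has 3 pens with 12 chickens each. How many chickens are there in total?
# Let's think step by step
# Answer: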
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -5,7 +5,7 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -21,7 +21,7 @@ logger = get_logger()
     dataset_id='modelscope/hellaswag',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='validation',
@@ -89,11 +89,7 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {
-            'data': ctx_continuation_pair_list,
-            'multi_choices': self.choices,
-            'system_prompt': self.prompt_template
-        }
+        return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
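For reference, the consolidated return value above keeps the payload shape the continuation-logits adapter scores: one (context, continuation) pair per candidate ending (the context and ending texts below are illustrative):

context = 'A man is sitting on a roof. He'
endings = [' starts pulling up roofing on a roof.', ' is using wrap to wrap a pair of skis.']
ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
# -> [('A man is sitting on a roof. He', ' starts pulling up roofing on a roof.'),
#     ('A man is sitting on a roof. He', ' is using wrap to wrap a pair of skis.')]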