evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -109
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/data_adapter.py

@@ -4,8 +4,8 @@ import random
from abc import ABC, abstractmethod
from typing import Any, Optional

-from evalscope.
-from evalscope.
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.utils import normalize_score
from evalscope.utils.logger import get_logger

logger = get_logger()

@@ -22,6 +22,11 @@ class DataAdapter(ABC):
                 prompt_template: str = '',
                 **kwargs):
        """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
        Args:
            subset_list: list of subset names for the dataset.
            metric_list: list, the metric list to evaluate the model on specific benchmark.

@@ -55,33 +60,36 @@ class DataAdapter(ABC):

        """
        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        subset_list = subset_list or self.subset_list

        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
-            logger.info(
-
+            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+                subsets: {subset_list}')
            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
        else:
+            from modelscope.msdatasets import MsDataset
+
            # Load dataset from remote
-            logger.info(
+            logger.info(
+                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
            data_dict = {}
            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
            if len(split_list) == 0:
                logger.error(f'Got empty split list: {split_list}')

-            subset_list = subset_list if subset_list is not None else self.subset_list
            for sub_name in subset_list:
                data_dict[sub_name] = {}
                # e.g. train: few-shot, test: target dataset to evaluate
                for split in split_list:
-                    dataset =
+                    dataset = MsDataset.load(
                        dataset_name=dataset_name_or_path,
-
+                        subset_name=sub_name,
                        split=split,
+                        cache_dir=work_dir,
                        hub=datasets_hub,
-                        work_dir=work_dir,
                        **kwargs)

                    data_dict[sub_name].update({split: dataset})

@@ -132,30 +140,111 @@ class DataAdapter(ABC):
                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
                res_dict[sub_name].append(prompt_d)

-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
        return res_dict

-
-
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: (score, num)}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
+
+        Here is a format example for ARC-Challenge:
+        {
+            "name":"ARC-Challenge",
+            "metric":"WeightedAverageAccuracy",
+            "score": 0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score": 0.3389,
+                    "subset":[
+                        {
+                            "name":"ARC-Challenge",
+                            "score": 0.3389,
+                            "num": 100
+                        },
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """  # noqa: E501
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score),
+            'num': num
+        } for subset_name, (score, num) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'DEFAULT',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> Any:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Attributes:
+            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
+                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
+
+        Returns:
+            Metric results.
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+        elif len(self.metric_list) == 1:
+            # review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+            items = [(score, 1.0) for score in review_res_list]
+            return self.metric_list[0]['object'](items)
+        else:
+            raise ValueError('Please implement the compute_metric method for multiple metrics.')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        """
        Generate model prompt from raw input, unify the prompt format for different datasets.
        The input format is compatible with OpenAI Chat Completions APIs.
-        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api

        Args:
            input_d (Any): The raw input. Depending on the dataset.
+            subset_name (str): The subset name.
+            few_shot_list (list): The few-shot examples.

        Returns:
+            For class ChatGenerationModelAdapter, the output format is:
+                {'data': [full_prompt], 'system_prompt': (str, optional)},  -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt]}
-
+                {'data': [full_prompt], 'multi_choices': self.choices}  -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
-        """
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}  -- ctx_continuation_pair_list: list, the context-continuation pair list.
+        """  # noqa: E501
        raise NotImplementedError

    @abstractmethod

@@ -172,7 +261,7 @@ class DataAdapter(ABC):
        raise NotImplementedError

    @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        """
        Parse the predicted result and extract proper answer.

@@ -187,77 +276,17 @@ class DataAdapter(ABC):
        raise NotImplementedError

    @abstractmethod
-    def match(self, gold: Any, pred: Any) ->
+    def match(self, gold: Any, pred: Any) -> float:
        """
        Match the gold answer and the predicted answer.

        Args:
            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A'
+                e.g. 'A', extracted from get_gold_answer method.
            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B'
+                e.g. 'B', extracted from parse_pred_result method.

        Returns:
            The match result. Usually a score (float) for chat/multiple-choice-questions.
        """
        raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
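Taken together, these hunks move report generation, metric aggregation and few-shot sampling into the DataAdapter base class, so a benchmark only has to supply the four methods named in the new docstring. Below is a minimal sketch of that contract; the adapter class and the question/answer field names are illustrative and not part of this diff, and get_gold_answer's signature is inferred from the existing adapters.

# A minimal sketch of a 0.9.0-style adapter; class and field names are
# illustrative and not part of this diff.
from typing import Any

from evalscope.benchmarks import DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import exact_match


class MyQAAdapter(DataAdapter):
    """Hypothetical adapter: only the four required methods are supplied."""

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # Build one prompt per sample; format matches the ChatGenerationModelAdapter case above.
        return {'data': [input_d['question']]}

    def get_gold_answer(self, input_d: dict) -> str:
        # Extract the reference answer from the raw sample (signature inferred from existing adapters).
        return input_d['answer']

    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        # Keep the raw model output as the prediction.
        return str(result)

    def match(self, gold: str, pred: str) -> float:
        # Per-sample score; the inherited compute_metric / gen_report aggregate these.
        return exact_match(gold=str(gold), pred=str(pred))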
evalscope/benchmarks/general_qa/__init__.py

@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -5,35 +5,32 @@ import os.path
from collections import defaultdict
from typing import Any, Optional

-from evalscope.benchmarks
-from evalscope.metrics
-
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh,
+                               weighted_mean)
+from evalscope.models import ChatGenerationModelAdapter
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()

-DATASET_ID = 'general_qa'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='general_qa',
+    dataset_id='general_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageBLEU],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
class GeneralQAAdapter(DataAdapter):
    # TODO: set few_shot_num

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+    def __init__(self, **kwargs):

-        super().__init__(
-            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+        super().__init__(**kwargs)

    def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

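The @Benchmark.register decorator above replaces the module-level DATASET_ID / SUBSET_LIST constants and the DataAdapterClass / ModelAdapterClass aliases that each benchmark's __init__.py used to export: metadata now travels with the adapter itself. A hedged sketch of declaring a custom benchmark the same way; the name, dataset id and defaults below are placeholders, not part of this release, and the four required methods are omitted for brevity (see the sketch earlier in this diff).

# Hypothetical registration following the pattern shown in this diff;
# the benchmark name, dataset id and defaults are placeholders.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                              # key used to select the benchmark
    dataset_id='my_org/my_qa',                 # local path or hub dataset id
    model_adapter=ChatGenerationModelAdapter,  # how the model is queried
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyCustomBenchmarkAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Registered defaults arrive through **kwargs, as in GeneralQAAdapter above.
        super().__init__(**kwargs)

    # gen_prompt / get_gold_answer / parse_pred_result / match omitted here.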
evalscope/benchmarks/gsm8k/__init__.py

@@ -1,5 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -1,35 +1,33 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) EleutherAI, Inc. and its affiliates.
+# flake8: noqa
import math
import os
import re

-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import WeightedAverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

-# flake8: noqa
-
logger = get_logger()

-DATASET_ID = 'modelscope/gsm8k'
-SUBSET_LIST = ['main']
-ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)')
-INVALID_ANS = '[invalid]'
-

+@Benchmark.register(
+    name='gsm8k',
+    dataset_id='modelscope/gsm8k',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['main'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=4,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
class GSM8KAdapter(DataAdapter):

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
+    def __init__(self, **kwargs):
        """
        Data adapter for GSM8K dataset.

@@ -41,30 +39,13 @@ class GSM8KAdapter(DataAdapter):
            eval_split (str): The target eval split name. Default: 'test'
            **kwargs: ...
        """
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 4-shot examples by system for GSM8K.')
-            few_shot_num = 4
-
+        few_shot_num = kwargs.get('few_shot_num', 4)
        if few_shot_num != 4 and few_shot_num != 0:
            logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                         f'Use 4-shot by default.')
-            few_shot_num = 4
+            kwargs['few_shot_num'] = 4

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}

@@ -142,66 +123,6 @@ class GSM8KAdapter(DataAdapter):

        return number_equal(gold_ans=gold, pred_ans=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name. Default: None
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"GSM8K",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.5632,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.5632,
-                    "subset":[
-                        {
-                            "name":"main",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'gsm8k',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
    @classmethod
    def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
        if use_fewshot:
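GSM8KAdapter now reads the registered few_shot_num out of **kwargs and overwrites invalid values before delegating to the base class. A standalone sketch of that guard using only the standard library; the helper name and allowed values are illustrative, not part of evalscope.

# Standalone sketch of the few-shot guard pattern used by GSM8KAdapter above;
# names here are illustrative, not part of evalscope.
import logging

logger = logging.getLogger(__name__)


def clamp_few_shot(kwargs: dict, allowed=(0, 4), default: int = 4) -> dict:
    """Normalize 'few_shot_num' in kwargs before handing them to the base adapter."""
    few_shot_num = kwargs.get('few_shot_num', default)
    if few_shot_num not in allowed:
        logger.error('Expected few_shot_num in %s, got %s; falling back to %s.',
                     allowed, few_shot_num, default)
        kwargs['few_shot_num'] = default
    return kwargs


# Example: clamp_few_shot({'few_shot_num': 2}) returns {'few_shot_num': 4}.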
evalscope/benchmarks/hellaswag/__init__.py

@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -3,9 +3,10 @@ import numpy as np
import os
import re

-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models import ContinuationLogitsModelAdapter
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

@@ -13,44 +14,30 @@ from evalscope.utils.logger import get_logger

logger = get_logger()

-DATASET_ID = 'modelscope/hellaswag'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='hellaswag',
+    dataset_id='modelscope/hellaswag',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='validation',
+    prompt_template='',
+)
class HellaSwagAdapter(DataAdapter):

    choices = ['0', '1', '2', '3']

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for HellaSwag.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):

+        few_shot_num = kwargs.get('few_shot_num', 0)
        if few_shot_num != 0:
            logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}

@@ -106,7 +93,7 @@ class HellaSwagAdapter(DataAdapter):
        # Get the gold choice
        return input_d['label']

-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

@@ -118,7 +105,7 @@ class HellaSwagAdapter(DataAdapter):
        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
            # answer: in the form of [-2.3, -4.5, ...], len of self.choices
            result = np.array(result)
            endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]

@@ -126,9 +113,9 @@ class HellaSwagAdapter(DataAdapter):
            best_choice_idx = np.argmax(result / completion_len)

            return str(best_choice_idx)
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
            return result  # TODO: to be supported !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
            return result  # TODO: to be supported !
        else:
            raise ValueError(f'Invalid eval_type: {eval_type}')

@@ -136,66 +123,6 @@ class HellaSwagAdapter(DataAdapter):
    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=str(gold), pred=str(pred))

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"HellaSwag",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'hellaswag',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
    @classmethod
    def _preprocess(cls, text):
        text = text.strip()
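For reference, the checkpoint branch of HellaSwagAdapter.parse_pred_result above picks the ending whose log-likelihood, normalized by a completion length derived from the candidate endings, is highest (the exact length computation falls between the hunks shown). An illustrative, standalone sketch of that selection with made-up numbers:

# Standalone sketch of the length-normalized choice selection used in
# HellaSwagAdapter.parse_pred_result above; the numbers are illustrative.
import numpy as np

# Summed log-likelihood of each candidate ending under the model (higher is better).
loglikelihoods = np.array([-12.7, -9.4, -15.2, -10.1])
# Length of each (preprocessed) ending, used to normalize.
completion_len = np.array([38, 41, 52, 40], dtype=float)

best_choice_idx = np.argmax(loglikelihoods / completion_len)
print(str(best_choice_idx))  # prints '1', an index into choices ['0', '1', '2', '3']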
evalscope/benchmarks/humaneval/__init__.py

@@ -1,5 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa