evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
--- a/evalscope/benchmarks/ceval/ceval_adapter.py
+++ b/evalscope/benchmarks/ceval/ceval_adapter.py
@@ -2,8 +2,11 @@
 import csv
 import os
 
-from evalscope.benchmarks
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
@@ -11,8 +14,6 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/ceval-exam'
-
 SUBSET_LIST = [
     'computer_network',
     'operating_system',
@@ -124,40 +125,29 @@ SUBJECT_MAPPING = {
 }
 
 
+@Benchmark.register(
+    name='ceval',
+    dataset_id='modelscope/ceval-exam',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split='dev',
+    eval_split='val',
+)
 class CEVALAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'val',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 5-shot by default
-            logger.info(f'Set 0-shot examples by default for C-Eval.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-            few_shot_num = 5
+            kwargs['few_shot_num'] = 5
+        super().__init__(**kwargs)
 
-
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -217,13 +207,13 @@ class CEVALAdapter(DataAdapter):
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -235,11 +225,11 @@ class CEVALAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -247,97 +237,6 @@ class CEVALAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"C-Eval",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'ceval',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _format_example(cls, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
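The ceval hunks above show the main API change in 0.10.0: benchmark metadata that 0.8.2 kept in module constants (DATASET_ID) and per-adapter __init__ defaults now goes through the @Benchmark.register decorator, metrics become named metric objects (AverageAccuracy) instead of dicts wrapping weighted_mean, and eval_type comparisons use EvalType constants. Below is a minimal sketch of a custom adapter written against that registration pattern; the benchmark name, dataset id, and subset are placeholders, and the gen_prompt method name and signature are assumed from other adapters rather than shown in this excerpt.

# Hypothetical adapter following the 0.10.0 registration pattern shown in the ceval/cmmlu hunks.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser


@Benchmark.register(
    name='my_mcq',                         # placeholder benchmark name
    dataset_id='my-org/my-mcq-dataset',    # placeholder ModelScope dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split='dev',
    eval_split='test',
)
class MyMCQAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def __init__(self, **kwargs):
        # Defaults now come from the register() call; just forward any overrides to DataAdapter.
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str = None, few_shot_list: list = None, **kwargs) -> dict:
        # Method name/signature assumed; the return format matches the diff above.
        options = '\n'.join(f'{c}. {input_d[c]}' for c in self.choices if c in input_d)
        return {'data': [f"{input_d['question']}\n{options}"], 'multi_choices': self.choices}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d.get('answer', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        if eval_type == EvalType.CHECKPOINT:
            return result
        return ResponseParser.parse_first_option_with_choices(result, self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

The matching cleanup is visible in the benchmark __init__.py hunks in this diff: the DataAdapterClass/ModelAdapterClass aliases that did this wiring in 0.8.2 are simply deleted.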
--- a/evalscope/benchmarks/cmmlu/__init__.py
+++ b/evalscope/benchmarks/cmmlu/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
--- a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py
+++ b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -3,8 +3,10 @@
 import csv
 import os
 
-from evalscope.benchmarks
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
@@ -12,8 +14,6 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/cmmlu'
-
 SUBSET_LIST = [
     'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
     'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
@@ -101,31 +101,24 @@ SUBJECT_MAPPING = {
 }
 
 
+@Benchmark.register(
+    name='cmmlu',
+    dataset_id='modelscope/cmmlu',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class CMMLUAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-
-                 metric_list: list = None,
-                 few_shot_num: int = 5,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
-
-        subset_list=subset_list,
-        metric_list=metric_list,
-        few_shot_num=few_shot_num,
-        train_split=train_split,
-        eval_split=eval_split,
-        **kwargs)
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -181,13 +174,13 @@ class CMMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('Answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -199,11 +192,11 @@ class CMMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
            return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
            raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -211,94 +204,6 @@ class CMMLUAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: the user-defined report name. Default: None
-
-        Returns:
-        {
-            "name":"CMMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'cmmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
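The deleted compute_metric/gen_report bodies were near-identical boilerplate in both adapters: take a sample-count-weighted mean across subsets and group subsets by their SUBJECT_MAPPING domain. A tiny standalone illustration of that roll-up follows (subset names and scores are made up); in 0.10.0 this aggregation is no longer per-adapter and presumably lives in the shared report code added under evalscope/report.

# Standalone sketch of the subset roll-up the removed gen_report performed (made-up numbers).
subset_score_map = {
    'computer_network': (0.25, 20),   # subset -> (accuracy, num_samples)
    'operating_system': (0.35, 20),
}

total_num = sum(num for _, num in subset_score_map.values())                      # 40
weighted_avg_acc = sum(s * n for s, n in subset_score_map.values()) / total_num   # (0.25*20 + 0.35*20) / 40 = 0.30
print(f'{weighted_avg_acc:.4f} over {total_num} samples')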
--- a/evalscope/benchmarks/competition_math/__init__.py
+++ b/evalscope/benchmarks/competition_math/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa