evalscope 0.12.0-py3-none-any.whl → 0.13.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
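The new evalscope/metrics/llm_judge.py module and the llm_as_a_judge flag on DataAdapter (see the data_adapter.py hunks below) add an LLM-as-a-judge scoring path. The sketch below illustrates the judge interface that DataAdapter.llm_match relies on; the method names (build_prompt, a callable judge, get_score) come from the diff, while the class body, prompt wording, and score parsing are illustrative assumptions, not the actual LLMJudge implementation.

# Illustrative stand-in for the judge interface used by DataAdapter.llm_match below.
# Method names (build_prompt, __call__, get_score) come from the diff; the prompt
# wording and score parsing are assumptions made for this sketch.
import re
from typing import Optional


class DummyJudge:

    def build_prompt(self, pred: str, gold: str, question: Optional[str] = None) -> str:
        # Compose a grading request for the judge model.
        parts = []
        if question:
            parts.append(f'Question: {question}')
        parts.append(f'Reference answer: {gold}')
        parts.append(f'Candidate answer: {pred}')
        parts.append('Reply with "Score: 1" if the candidate matches the reference, else "Score: 0".')
        return '\n'.join(parts)

    def __call__(self, prompt: str) -> str:
        # A real judge would call an LLM endpoint here; we fake a reply.
        return 'Score: 1'

    def get_score(self, response: str) -> float:
        # Pull a numeric score out of the judge's reply; default to 0.0.
        match = re.search(r'Score:\s*([01](?:\.\d+)?)', response)
        return float(match.group(1)) if match else 0.0


judge = DummyJudge()
prompt = judge.build_prompt(pred='Paris', gold='Paris', question='Capital of France?')
print(judge.get_score(judge(prompt)))  # -> 1.0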
--- a/evalscope/benchmarks/data_adapter.py
+++ b/evalscope/benchmarks/data_adapter.py
@@ -5,7 +5,9 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, List, Optional, Union
 
+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.metrics.llm_judge import LLMJudge
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
@@ -18,8 +20,10 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
+                 llm_as_a_judge: bool = False,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -48,6 +52,7 @@ class DataAdapter(ABC):
         """
         self.name = name
         self.dataset_id = dataset_id
+        self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -58,7 +63,17 @@ class DataAdapter(ABC):
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
+        self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)
 
     def load(self,
              dataset_name_or_path: str = None,
@@ -78,11 +93,17 @@ class DataAdapter(ABC):
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            trust_remote_code = kwargs.pop('trust_remote_code', False)
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
         else:
-
-
-
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            trust_remote_code = kwargs.pop('trust_remote_code', True)
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
         return data_dict
 
     def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
@@ -91,8 +112,7 @@ class DataAdapter(ABC):
         datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
         split_as_subset: bool = kwargs.pop('split_as_subset', False)
         # Load dataset from remote
-        logger.info(
-            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
         data_dict = {}
         split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
@@ -133,21 +153,7 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-
-
-        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-            subsets: {subset_list}')
-        data_dict = {}
-        subset_list = subset_list or self.subset_list
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        for sub_name in subset_list:
-            data_dict[sub_name] = {}
-            # e.g. train: few-shot, test: target dataset to evaluate
-            for split in split_list:
-                dataset = MsDataset.load(
-                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
-                data_dict[sub_name].update({split: dataset})
-        return data_dict
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
@@ -285,6 +291,12 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
@@ -348,3 +360,29 @@
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
+
+    def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
+        """
+        Use LLM as a judge to evaluate the predicted answer against the gold answer.
+
+        Args:
+            gold (Any): The golden answer.
+            pred (Any): The predicted answer.
+
+        Returns:
+            The match result as a float score between 0 and 1.
+        """
+        # Default judge handling
+        if judge is None:
+            logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
+            return 0
+
+        # Extract question from raw_input if available
+        raw_input = kwargs.get('raw_input', {})
+        question_keys = ['question', 'prompt', 'query', 'problem']
+        question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
+
+        # Request judge and obtain score
+        prompt = judge.build_prompt(pred, gold, question)
+        score = judge(prompt)
+        return judge.get_score(score)
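The __init_subclass__ hook above wraps every subclass's parse_pred_result with preprocess_decorator at class-creation time. Below is a minimal, generic sketch of that pattern; strip_decorator is a hypothetical stand-in, since the body of preprocess_decorator (in the new evalscope/benchmarks/utils.py) is not shown in this diff.

# Generic sketch of the __init_subclass__ wrapping pattern used above.
# strip_decorator is a hypothetical stand-in for preprocess_decorator.
import functools


def strip_decorator(func):

    @functools.wraps(func)
    def wrapper(self, result, *args, **kwargs):
        # Pre-process the raw prediction before the adapter-specific parsing runs.
        if isinstance(result, str):
            result = result.strip()
        return func(self, result, *args, **kwargs)

    return wrapper


class BaseAdapter:

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Every subclass that defines parse_pred_result gets the wrapper automatically.
        if hasattr(cls, 'parse_pred_result'):
            cls.parse_pred_result = strip_decorator(cls.parse_pred_result)


class MyAdapter(BaseAdapter):

    def parse_pred_result(self, result):
        return result.upper()


print(MyAdapter().parse_pred_result('  answer a  '))  # -> 'ANSWER A'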
--- a/evalscope/benchmarks/general_mcq/general_mcq_adapter.py
+++ b/evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -16,8 +15,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -27,11 +28,11 @@ logger = get_logger()
     query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(query=context)
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,14 +104,10 @@
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
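When a benchmark runs with model_adapter=OutputType.GENERATION, the adapter falls back to ResponseParser.parse_first_option_with_choices to pull an option letter out of free-form model output. The following is a rough, self-contained illustration of that kind of first-option extraction; the regexes and fallback order are assumptions, not evalscope's actual parser.

# Rough illustration of first-option extraction from generated text.
# The regex patterns and fallback order are assumptions, not ResponseParser's real logic.
import re
from typing import List, Optional


def parse_first_option(text: str, options: List[str]) -> Optional[str]:
    letters = ''.join(options)
    # Prefer an explicit "answer is X" statement, then any standalone option letter.
    patterns = [
        rf'answer is \(?([{letters}])\)?',
        rf'\b([{letters}])\b',
    ]
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return match.group(1).upper()
    return None


print(parse_first_option('Reasoning... The answer is (C).', ['A', 'B', 'C', 'D']))  # -> 'C'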
--- a/evalscope/benchmarks/general_qa/general_qa_adapter.py
+++ b/evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,12 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import glob
 import os.path
 from collections import defaultdict
 from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -16,9 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -31,18 +28,31 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
 
-
+        data_file_dict = defaultdict(str)
         data_list = []
 
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
@@ -65,7 +75,7 @@ class GeneralQAAdapter(DataAdapter):
 
         query = input_d.get('question', '') or input_d.get('query', '')
         prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -100,10 +110,12 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         res = dict()
-
-
-
-
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
         return res
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
@@ -119,7 +131,10 @@ class GeneralQAAdapter(DataAdapter):
         """
         items = defaultdict(list)
         for scores in review_res_list:
-
-
+            if isinstance(scores, dict):
+                for k, v in scores.items():
+                    items[k].append(v)
+            else:
+                items['AverageAccuracy'].append(scores)
         # items = [(score, 1.0) for score in review_res_list]
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
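The reworked general_qa loader first maps either a directory of <subset>.jsonl files or a single .jsonl file onto subset names, then reads the rows. A small standalone sketch of that mapping, assuming a read_jsonl helper in place of evalscope's jsonl_to_list:

# Standalone sketch of the subset -> jsonl-file mapping used by the new general_qa loader.
# read_jsonl stands in for evalscope's jsonl_to_list helper.
import json
import os
from typing import Dict, List


def read_jsonl(path: str) -> List[dict]:
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def map_subsets(dataset_path: str, subset_list: List[str]) -> Dict[str, str]:
    data_files: Dict[str, str] = {}
    if os.path.isdir(dataset_path):
        # A directory: one <subset>.jsonl file per requested subset.
        for subset_name in subset_list:
            data_files[subset_name] = os.path.join(dataset_path, f'{subset_name}.jsonl')
    elif os.path.isfile(dataset_path):
        # A single file: its basename becomes the only subset.
        data_files[os.path.basename(dataset_path).split('.')[0]] = dataset_path
    else:
        raise ValueError(f'Invalid dataset path: {dataset_path}')
    return data_files


# e.g. {'default': {'test': [...]}} for a local ./data/default.jsonl
# data_dict = {name: {'test': read_jsonl(path)} for name, path in map_subsets('./data', ['default']).items()}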
--- a/evalscope/benchmarks/gpqa/gpqa_adapter.py
+++ b/evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -3,15 +3,16 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
-            self.prompt_prefix += open(
-
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'
 
@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
         prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(prompt)
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
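In few-shot mode the GPQA adapter now reads a chain_of_thought.txt file that sits next to the adapter and prepends it to every question. Below is a hedged sketch of assembling such a prompt; format_options is an assumed helper, since the adapter's private __form_options is not shown in this diff.

# Sketch of assembling a GPQA-style few-shot prompt from a chain-of-thought prefix file.
# format_options is an assumed helper; the adapter's own __form_options is not shown here.
import os
from typing import List


def format_options(choices: List[str]) -> str:
    letters = ['A', 'B', 'C', 'D']
    return '\n'.join(f'({letters[i]}) {choice}' for i, choice in enumerate(choices))


def build_prompt(question: str, choices: List[str], cot_path: str) -> str:
    prefix = ('Here are some example questions from experts. Answer the final question '
              'yourself, following the format of the previous questions exactly.\n')
    if os.path.exists(cot_path):
        with open(cot_path, 'r', encoding='utf-8') as f:
            prefix += f.read() + '\nQuestion: '
    return prefix + f'{question}\n{format_options(choices)}'


# build_prompt('Which particle mediates the strong force?',
#              ['Photon', 'Gluon', 'W boson', 'Graviton'],
#              'chain_of_thought.txt')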
--- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
+++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -15,8 +14,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):
 
         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
--- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py
+++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -4,9 +4,8 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,20 +17,19 @@ logger = get_logger()
 
 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=ContinuationLogitsModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='validation',
-    prompt_template=
-    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+    prompt_template='{query}',  # noqa: E501
 )
 class HellaSwagAdapter(DataAdapter):
 
-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +38,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0
 
         super().__init__(**kwargs)
+        self.choices = ['A', 'B', 'C', 'D']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -85,15 +84,14 @@
             self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
         ]
         context: str = '\n'.join(few_shot_prompts) + '\n'
-        context
-
-        ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
+        query = context.strip() + self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
 
-
+        full_prompt = self.prompt_template.format(query=query)
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d['label']
+        # Get the gold choice from the label
+        return self.choices[int(input_d['label'])]
 
     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
@@ -107,34 +105,22 @@
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
-
-            result = np.array(result)
-            endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
-            completion_len = np.array([float(len(i)) for i in endings])
-            best_choice_idx = np.argmax(result / completion_len)
-
-            return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
         else:
-
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
 
-
-    def _preprocess(cls, text):
+    def _preprocess(self, text):
         text = text.strip()
         text = text.replace(' [title]', '. ')
         text = re.sub('\\[.*?\\]', '', text)
         text = text.replace('  ', ' ')
         return text
 
-
-    def _generate_prompt(cls, input_d: dict, endings: list, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, endings: list, include_answer=True) -> str:
         """
         Generate prompt for HellaSwag dataset.
 
@@ -148,7 +134,13 @@
         """
 
         ctx = input_d['ctx_a'] + ' ' + input_d['ctx_b'].capitalize()
-        example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+        # example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+        example: str = self._preprocess(ctx)
+
+        example += '\nQuestion: Which ending makes the most sense?'
+        for i, ending in enumerate(endings):
+            example += f'\n{self.choices[i]}. {ending}'
+        example += '\nYou may choose from A, B, C, D. Derive your final answer as `The answer is ...`.'
 
         if include_answer:
             example += '{}\n\n'.format(endings[int(input_d['label'])])
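The deleted checkpoint branch in parse_pred_result scored HellaSwag by picking the ending whose continuation log-likelihood, normalized by ending length, was largest (np.argmax(result / completion_len)). A short sketch of that selection rule, assuming logliks holds one summed log-probability per candidate ending:

# Sketch of the length-normalized log-likelihood selection the deleted branch performed.
# logliks is assumed to hold one summed log-probability per candidate ending.
import numpy as np

endings = ['puts the bowl away.', 'keeps whisking the eggs until they are fluffy.']
logliks = np.array([-14.2, -21.5])

completion_len = np.array([float(len(e)) for e in endings])
best_choice_idx = int(np.argmax(logliks / completion_len))
print(best_choice_idx)  # index of the ending with the best per-character log-likelihood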
--- a/evalscope/benchmarks/humaneval/humaneval_adapter.py
+++ b/evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -2,7 +2,6 @@
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -13,14 +12,18 @@ logger = get_logger()
 
 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
     prompt_template='Complete the following python code:\n{query}',
+    extra_params={
+        'num_workers': 4,
+        'timeout': 4
+    },
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -35,17 +38,17 @@ class HumanevalAdapter(DataAdapter):
             raise ImportError('Please install human_eval:'
                               'https://github.com/openai/human-eval/tree/master#installation , '
                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
+        super().__init__(**kwargs)
 
+        extra_params = kwargs.get('extra_params', {})
         self.k = [1]
-        self.num_workers = 4
-        self.timeout = 4
+        self.num_workers = extra_params.get('num_workers', 4)
+        self.timeout = extra_params.get('timeout', 4)
 
         self.read_problems_func = stream_jsonl
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-        super().__init__(**kwargs)
-
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -66,7 +69,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)
 
-        return
+        return self.gen_prompt_data(full_prompt)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
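HumanEval keeps the Pass@1 metric, and the new live_code_bench/pass_k_utils.py suggests the same pass@k machinery is reused there. The standard unbiased estimator from the HumanEval paper is pass@k = 1 - C(n-c, k) / C(n, k) for n generated samples of which c pass; the sketch below implements that formula and is not necessarily evalscope's exact code.

# Unbiased pass@k estimator (Chen et al., 2021): 1 - C(n-c, k) / C(n, k),
# evaluated in product form for numerical stability.
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """n: samples generated per task, c: samples that passed, k: evaluation budget."""
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


print(pass_at_k(n=10, c=3, k=1))  # -> 0.3, i.e. plain accuracy when k == 1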
--- a/evalscope/benchmarks/ifeval/ifeval_adapter.py
+++ b/evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return
+        return self.gen_prompt_data(input_d['prompt'])
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
--- a/evalscope/benchmarks/iquiz/iquiz_adapter.py
+++ b/evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return
+        return self.gen_prompt_data(prompt)
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """