evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/data_adapter.py

@@ -2,10 +2,11 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, List, Optional

-from evalscope.
-from evalscope.
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.metrics import Metric
+from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -14,15 +15,22 @@ logger = get_logger()
 class DataAdapter(ABC):

     def __init__(self,
+                 name: str,
                  subset_list: list,
-                 metric_list:
+                 metric_list: List[Metric],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
-                 prompt_template: str =
+                 prompt_template: Optional[str] = None,
                  **kwargs):
         """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
         Args:
+            name: str, the name of the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0

@@ -32,6 +40,7 @@ class DataAdapter(ABC):
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                     the form of A or B or C or D, do not output explanation:`
         """
+        self.name = name
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num

@@ -39,6 +48,7 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.config_kwargs = kwargs
+        self.category_map = kwargs.get('category_map', {})

     def load(self,
              dataset_name_or_path: str,

@@ -55,33 +65,36 @@

         """
         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        subset_list = subset_list or self.subset_list

         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(
-
+            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
             if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
+            from modelscope.msdatasets import MsDataset
+
             # Load dataset from remote
-            logger.info(
+            logger.info(
+                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
             data_dict = {}
             split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
             if len(split_list) == 0:
                 logger.error(f'Got empty split list: {split_list}')

-            subset_list = subset_list if subset_list is not None else self.subset_list
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
                 for split in split_list:
-                    dataset =
+                    dataset = MsDataset.load(
                         dataset_name=dataset_name_or_path,
-
+                        subset_name=sub_name,
                         split=split,
+                        cache_dir=work_dir,
                         hub=datasets_hub,
-                        work_dir=work_dir,
                         **kwargs)

                     data_dict[sub_name].update({split: dataset})

@@ -132,30 +145,105 @@
                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d
                 res_dict[sub_name].append(prompt_d)

-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict

-
-
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> List[dict]:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Returns:
+            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+
+        res_list = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            metric_func = metric.object
+            res_list.append({
+                'metric_name': metric_name,
+                'score': metric_func(review_res_list),
+                'num': len(review_res_list)
+            })
+        return res_list
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report.
+
+        Here is a format example for gsm8k:
+        {
+            "name": "qwen2.5_gsm8k",
+            "metrics": [
+                {
+                    "name": "AverageAccuracy",
+                    "categories": [
+                        {
+                            "name": "default",
+                            "subsets": [
+                                {
+                                    "name": "main",
+                                    "score": 0.0,
+                                    "num": 2
+                                }
+                            ],
+                            "num": 2,
+                            "score": 0.0,
+                            "macro_score": 0.0
+                        }
+                    ],
+                    "num": 2,
+                    "score": 0.0,
+                    "macro_score": 0.0
+                }
+            ],
+            "dataset_name": "gsm8k",
+            "model_name": "qwen2.5"
+        }
+        """  # noqa: E501
+        kwargs['category_map'] = self.category_map
+        kwargs['metric_list'] = self.metric_list
+        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
         The input format is compatible with OpenAI Chat Completions APIs.
-        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api

         Args:
             input_d (Any): The raw input. Depending on the dataset.
+            subset_name (str): The subset name.
+            few_shot_list (list): The few-shot examples.

         Returns:
+            For class ChatGenerationModelAdapter, the output format is:
+                {'data': [full_prompt], 'system_prompt': (str, optional)},  -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt]}
-
+                {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
-        """
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
+        """  # noqa: E501
         raise NotImplementedError

     @abstractmethod

@@ -172,7 +260,7 @@ class DataAdapter(ABC):
         raise NotImplementedError

     @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.

@@ -193,71 +281,11 @@ class DataAdapter(ABC):

         Args:
             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                        e.g. 'A'
+                        e.g. 'A', extracted from get_gold_answer method.
             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                        e.g. 'B'
+                        e.g. 'B', extracted from parse_pred_result method.

         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
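
Note: this refactor turns compute_metric and gen_report into concrete base-class methods, so a benchmark adapter now only supplies gen_prompt, get_gold_answer, parse_pred_result and match, plus a @Benchmark.register(...) declaration. Below is a minimal sketch of that pattern under the new 0.10.x API, using only names that appear elsewhere in this diff; the benchmark name, dataset id and the 'question'/'answer' input fields are hypothetical.

# Sketch only: assumes evalscope >= 0.10 is installed; not an official example.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import Pass1, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_benchmark',             # hypothetical benchmark name
    dataset_id='my_org/my_dataset',  # hypothetical ModelScope dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[Pass1],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # One chat-style prompt per sample, in the format documented for ChatGenerationModelAdapter.
        return {'data': [input_d['question']], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

With this shape, the inherited compute_metric and gen_report take care of aggregating per-sample match results and building the Report object.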
evalscope/benchmarks/general_qa/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,39 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
-import json
 import os.path
 from collections import defaultdict
-from typing import
+from typing import List

-from evalscope.benchmarks
-from evalscope.metrics
-from evalscope.
-from evalscope.utils import jsonl_to_list
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()

-DATASET_ID = 'general_qa'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='general_qa',
+    dataset_id='general_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageBLEU],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+    def __init__(self, **kwargs):

-        super().__init__(
-            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+        super().__init__(**kwargs)

     def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

@@ -71,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):

         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt]}
+        return {'data': [prompt], 'system_prompt': self.prompt_template}

     def get_gold_answer(self, input_d: dict) -> str:
         """

@@ -95,14 +90,14 @@
         """
         return result

-    def match(self, gold: str, pred: str) ->
+    def match(self, gold: str, pred: str) -> dict:
         """
         Args:
             gold: str
             pred: str

         Returns:
-            bleu_score:
+            bleu_score: dict

         """
         res = dict()

@@ -110,10 +105,9 @@
         bleu_dict = bleu_ngram_one_sample(pred, gold)
         res.update(rouge_dict)
         res.update(bleu_dict)
-        # return bleu(item)
         return res

-    def compute_metric(self, review_res_list:
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples

@@ -121,62 +115,12 @@
             review_res_list: [score1, score2, ...]

         Returns:
-            avg_res:
+            avg_res: List[dict]

         """
         items = defaultdict(list)
         for scores in review_res_list:
             for k, v in scores.items():
-                items[k].append(
+                items[k].append(v)
         # items = [(score, 1.0) for score in review_res_list]
-
-        # return weighted_mean(items)
-        return res
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Args:
-            subset_score_map: {subset_name: (score_dict, num), ...}
-            report_name: str, the user-defined report name.
-
-        Returns:
-        {
-            "name":"GeneralQA",
-            "metric":"WeightedAverageBLEU",
-            "score":0.399,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.399,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.399
-                        },
-                    ]
-                }
-            ],
-            "total_num":10
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': score_dict
-        } for subset_name, (score_dict, _) in subset_score_map.items()]
-        total_avg_list = defaultdict(float)
-        for score_dict, num in subset_score_map.values():
-            for metric, score in score_dict.items():
-                total_avg_list[metric] += score * num / total_num
-
-        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'general_qa',
-            metric=self.metric_list[0]['name'],
-            score=total_avg_list,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
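
Note: the rewritten compute_metric no longer builds a report itself; it groups the per-sample ROUGE/BLEU dicts returned by match and averages each key. A standalone illustration of that aggregation follows; the metric keys and scores are made up, and sum(v) / len(v) stands in for evalscope's mean helper.

# Sketch of the aggregation performed by GeneralQAAdapter.compute_metric.
from collections import defaultdict

review_res_list = [                       # hypothetical per-sample scores returned by match()
    {'rouge-l-f': 0.40, 'bleu-4': 0.20},
    {'rouge-l-f': 0.60, 'bleu-4': 0.30},
]

items = defaultdict(list)
for scores in review_res_list:
    for k, v in scores.items():
        items[k].append(v)                # group scores by metric key

result = [{'metric_name': k, 'score': sum(v) / len(v), 'num': len(v)} for k, v in items.items()]
print(result)
# [{'metric_name': 'rouge-l-f', 'score': 0.5, 'num': 2}, {'metric_name': 'bleu-4', 'score': 0.25, 'num': 2}]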
evalscope/benchmarks/gpqa/__init__.py

File without changes
evalscope/benchmarks/gpqa/chain_of_thought.txt

@@ -0,0 +1,81 @@
+Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
+Choices:
+(A) 1/400
+(B) 19/400
+(C) 20/400
+(D) 38/400
+Let's think step by step:
+The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele in the given population.
+According to the Hardy-Weinberg equation p∧2 + 2pq + q∧2 = 1, where p is the frequency of dominant allele frequency, q is the frequency of recessive allele frequency, p∧2 is the frequency of the homozygous dominant allele, q∧2 is the frequency of the recessive allele, and 2pq is the frequency of the heterozygous allele.
+Given that q∧2=1/400, hence, q=0.05 and p=1-q=0.95.
+The frequency of the heterozygous allele is 2pq=2*0.05*0.95=38/400.
+The correct answer is (D)
+Question: A Fe pellet of 0.056 g is first dissolved in 10 mL of hydrobromic acid HBr (0.1 M). The resulting solution is then titrated by KMnO4 (0.02 M). How many equivalence points are there?
+Choices:
+(A) Two points, 25 ml and 35 ml
+(B) One point, 25 mL
+(C) One point, 10 ml
+(D) Two points, 25 ml and 30 ml
+Let's think step by step:
+HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+Two equivalence points will exist 25 ml and 35 ml.
+HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+Two equivalence points will exist 25 ml and 35 ml.
+In the beaker there is Fe2+ and Br-.
+When considering titration with two analytes one will have to consider which reaction will occur first.
+Since it is a redox titration consider the reduction potential of:
+E0 (Br2 /Br- ) = 1.09 V E0 (MnO4-/ Mn2+) = 1.49 V E0 (Fe3+/Fe2+) =0.77 V
+[Fe2+]=m/MV=0.1M.
+Reaction 1: MnO4- + 5Fe2+ + 8H+ → Mn2+ + 5Fe3+ + 4H2O
+Reaction 2: 2MnO4- + 10Br- + 16H+ → 2Mn2+ + 5Br2 + 8H2O
+So MnO4- will first react with Fe2+ with a stoichiometry of 1:5 so Veq1 will be 10 ml.
+Then when Fe2+ is used up, MnO4- will react with Br- with a stoichiometry of 2:10 then V added will be 25 ml so Veq2=25+10=35 ml.
+The correct answer is (A)
+Question: Consider a quantum mechanical system containing a particle of mass $m$ moving in an istropic three dimensional potential of the form $V(r) = 1/2 m \omega^2 r^2$ corresponding to the acted force obeying Hooke’s law. Here, $\omega$ is the angular frequency of oscillation and $r$ is the radial distance of the particle from the origin in spherical polar coordinate. What is the value of energy of the third excited state, and how many linearly independent eigenfunctions are possible for the same energy eigenvalue?
+Choices:
+(A) 11 \pi^2 \hbar^2 / (2m r^2), 3
+(B) (9/2) \hbar \omega , 10
+(C) 11 \pi^2 \hbar^2 / (2m r^2), 10
+(D) (9/2) \hbar \omega, 3
+Let's think step by step:
+This problem is nothing but the three dimensional simple harmonic oscillator (SHO) problem.
+The energy spectrum of three dimensional SHO is $E_n= (n+3/2)\hbar \omega$ where $n=0,1,2,3….$.
+For third excited state n=3.
+3+3/2=6/2+3/2=9/2.
+Thus the corresponding energy is $(9/2)\hbar \omega$.
+The degeneracy of the state is $g_n= (n+1)(n+2)/2$.
+For n=3, degeneracy is (3+1)*(3+2)/2=4*5/2=10.
+The correct answer is (B)
+Question: "Your overhear two chemists talking to each other as they leave a synthetic organic chemistry lab. One asks the other "So, how did it go?" The second chemist replies, "Not well - my compounds are on top of each other." What is the second chemist most likely referring to?"
+Choices:
+(A) The compounds they are working with have similar polarities.
+(B) The compounds they are working with have similar boiling points.
+(C) The compounds they are working with are bonding to each other through non-covalent/van der Waals interactions.
+(D) The compounds they are working with have similar optical rotations.
+Let's think step by step:
+"On top of each other" commonly refers to two compounds that have similar Rf values on chromatography (a common operation in synthetic chemistry).
+Similar Rf values arise for compounds with similar polarities.
+The correct answer is (A)
+Question: Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?
+Choices:
+(A) 1/2
+(B) 1/4
+(C) 2/3
+(D) 1/8
+Let's think step by step:
+When finding the correct answer, the probability of playing forever and the coin's single-point toss will be calculated.
+For example, a tail may appear on the first shot.
+This probability is 1/2. if the first toss doesn't come up, it shouldn't come to the second roll either, because the second throw is an even number.
+So it can come in the third shot.
+This is (1/2)(1/2)(1/2).
+So (1/2)^3=1/8.
+Or it could come on the fifth shot.
+This is (1/2)^5=1/32.
+This is actually a geometric series that goes on forever.
+We can write this series as follows.
+(1/2) + (1/2)^3 + (1/2)^5 + (1/2)^7 + ……….
+The solution for this series is as follows : a1/(1-r) where a1 is the first number and r is the sequence or r= a2/a1 or a3/a2 etc.
+a1=1/2
+r=(1/2)^2=1/4
+So a1/(1-r)=(1/2)/(1-1/4)=(1/2)/(3/4)=2/3.
+The correct answer is (C)
evalscope/benchmarks/gpqa/gpqa_adapter.py

@@ -0,0 +1,103 @@
+import os
+import random
+import re
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Pass1, exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='gpqa',
+    dataset_id='modelscope/gpqa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
+    metric_list=[Pass1],
+    few_shot_num=5,
+    train_split='train',
+    eval_split='train',  # only have train split
+    prompt_template='',
+)
+class GPQAAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+        if self.few_shot_num and self.few_shot_num > 0:
+            self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
+            self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
+                                       'r').read() + '\nQuestion: '
+        else:
+            self.prompt_prefix = 'What is the correct answer to this question:'
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        example:
+        {
+            "question":"Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?",
+            "choice1":"1/2",
+            "choice2":"1/4",
+            "choice3":"2/3",
+            "choice4":"1/8",
+            "answer":"C",
+        }
+        """  # noqa: E501
+        processed_input_d = self.__process_input(input_d)
+        input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
+        prompt = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}Let's think step by step: "  # noqa: E501
+
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+    def __process_input(self, input_d: dict) -> dict:
+
+        def preprocess(text):
+            if text is None:
+                return ' '
+            text = text.strip()
+            text = text.replace(' [title]', '. ')
+            text = re.sub('\\[.*?\\]', '', text)
+            text = text.replace('  ', ' ')
+            return text
+
+        choices = [
+            preprocess(input_d['Incorrect Answer 1']),
+            preprocess(input_d['Incorrect Answer 2']),
+            preprocess(input_d['Incorrect Answer 3']),
+            preprocess(input_d['Correct Answer']),
+        ]
+        random.shuffle(choices)
+        correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))
+
+        out_doc = {
+            'choices': [choices[0], choices[1], choices[2], choices[3]],
+            'answer': f'{chr(65 + correct_answer_index)}',
+        }
+        return out_doc
+
+    def __form_options(self, options: list):
+        option_str = 'Choices:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}) {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
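
Note: in __process_input the correct answer is shuffled in among the three distractors, and the gold label is simply the letter at its shuffled index. A standalone sketch of that derivation follows; the answer strings are hypothetical and random.shuffle makes the printed result vary from run to run.

# Sketch of the gold-letter derivation used by GPQAAdapter.__process_input.
import random

correct = '2/3'                      # hypothetical correct answer
distractors = ['1/2', '1/4', '1/8']  # hypothetical incorrect answers

choices = distractors + [correct]
random.shuffle(choices)
gold_letter = chr(65 + choices.index(correct))  # chr(65) == 'A', so indices 0-3 map to 'A'-'D'

print(choices, gold_letter)
# e.g. ['1/4', '2/3', '1/2', '1/8'] B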
evalscope/benchmarks/gsm8k/__init__.py

@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa