evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
from collections import defaultdict
|
|
6
6
|
from typing import Any, Dict, List, Optional, Union
|
|
7
7
|
|
|
8
|
-
from evalscope.benchmarks.utils import PromptData, preprocess_decorator
|
|
8
|
+
from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
|
|
9
9
|
from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
|
|
10
10
|
from evalscope.metrics import LLMJudge, metric_registry
|
|
11
11
|
from evalscope.report import Report, ReportGenerator
|
|
@@ -15,6 +15,13 @@ logger = get_logger()
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class DataAdapter(ABC):
|
|
18
|
+
"""
|
|
19
|
+
Data Adapter for the benchmark. You need to implement the following methods:
|
|
20
|
+
- gen_prompt
|
|
21
|
+
- get_gold_answer
|
|
22
|
+
- parse_pred_result
|
|
23
|
+
- match
|
|
24
|
+
"""
|
|
18
25
|
|
|
19
26
|
def __init__(self,
|
|
20
27
|
name: str,
|
|
@@ -31,30 +38,36 @@ class DataAdapter(ABC):
|
|
|
31
38
|
system_prompt: Optional[str] = None,
|
|
32
39
|
query_template: Optional[str] = None,
|
|
33
40
|
pretty_name: Optional[str] = None,
|
|
41
|
+
description: Optional[str] = None,
|
|
34
42
|
**kwargs):
|
|
35
43
|
"""
|
|
36
|
-
Data Adapter for the benchmark. You need to implement the following methods:
|
|
37
|
-
- gen_prompt
|
|
38
|
-
- get_gold_answer
|
|
39
|
-
- parse_pred_result
|
|
40
|
-
- match
|
|
41
44
|
Args:
|
|
42
45
|
name: str, the name of the benchmark.
|
|
43
46
|
dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
|
|
47
|
+
model_adapter: str, the model adapter to use for the benchmark.
|
|
44
48
|
subset_list: list of subset names for the dataset.
|
|
45
49
|
metric_list: list, the metric list to evaluate the model on specific benchmark.
|
|
50
|
+
llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
|
|
51
|
+
output_types: list, the output types of the model adapter. Default: [model_adapter]
|
|
46
52
|
few_shot_num: int, number of few-shot examples. Default: 0
|
|
47
53
|
train_split: str, usually for few-shot examples. e.g. 'train'
|
|
48
54
|
eval_split: str, the target eval split name. e.g. 'test'
|
|
49
55
|
prompt_template: str, the prompt template for the benchmark,
|
|
50
56
|
e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
|
|
51
57
|
the form of A or B or C or D, do not output explanation:`
|
|
52
|
-
|
|
58
|
+
system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
|
|
59
|
+
query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
|
|
60
|
+
pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
|
|
61
|
+
description: str, the description of the benchmark,
|
|
62
|
+
e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
|
|
63
|
+
""" # noqa: E501
|
|
53
64
|
self.name = name
|
|
54
65
|
self.dataset_id = dataset_id
|
|
55
66
|
self.model_adapter = model_adapter
|
|
56
67
|
self.subset_list = subset_list
|
|
57
68
|
self.metric_list = metric_list
|
|
69
|
+
self.llm_as_a_judge = llm_as_a_judge
|
|
70
|
+
self.output_types = output_types or [model_adapter]
|
|
58
71
|
self.few_shot_num = few_shot_num
|
|
59
72
|
self.train_split = train_split
|
|
60
73
|
self.eval_split = eval_split
|
|
@@ -62,9 +75,8 @@ class DataAdapter(ABC):
|
|
|
62
75
|
self.system_prompt = system_prompt
|
|
63
76
|
self.query_template = query_template
|
|
64
77
|
self.pretty_name = pretty_name
|
|
78
|
+
self.description = description
|
|
65
79
|
self.config_kwargs = kwargs
|
|
66
|
-
self.output_types = output_types or [model_adapter]
|
|
67
|
-
self.llm_as_a_judge = llm_as_a_judge
|
|
68
80
|
self.category_map = kwargs.get('category_map', {})
|
|
69
81
|
self.choices = kwargs.get('choices', None)
|
|
70
82
|
|
|
@@ -156,6 +168,49 @@ class DataAdapter(ABC):
|
|
|
156
168
|
"""
|
|
157
169
|
return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
|
|
158
170
|
|
|
171
|
+
def load_with_snapshot(self,
                       file_structure: Dict[str, List[str]],
                       dataset_name_or_path: str = None,
                       subset_list: list = None,
                       work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
                       **kwargs) -> dict:
    """
    Load a dataset from raw files, for datasets that MsDataset cannot load correctly.

    The dataset is read from a local directory when the resolved path exists;
    otherwise a snapshot is downloaded from ModelScope first.

    Args:
        file_structure: dict mapping each subset name to the files that make it up,
            e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
        dataset_name_or_path: str, dataset id on ModelScope or a local path.
            Falls back to ``self.dataset_id`` when empty.
        subset_list: list of subset names to load. Falls back to ``self.subset_list`` when empty.
        work_dir: str, cache directory handed to the snapshot download.

    Returns:
        dict of the form {subset_name: {self.eval_split: loaded_data}}.
        Only the eval split is populated; no train split is loaded.
    """  # noqa: E501
    resolved_source = os.path.expanduser(dataset_name_or_path or self.dataset_id)
    wanted_subsets = subset_list or self.subset_list

    if not os.path.exists(resolved_source):
        # Remote dataset: fetch a snapshot restricted to the files named in file_structure.
        from modelscope import dataset_snapshot_download

        logger.info(f'Loading dataset from modelscope: > dataset_name: {resolved_source}')
        wanted_files = []
        for subset_files in file_structure.values():
            wanted_files.extend(subset_files)
        dataset_root = dataset_snapshot_download(
            resolved_source, cache_dir=work_dir, allow_file_pattern=wanted_files)
    else:
        # Local dataset: use the directory as-is.
        logger.info(f'Loading dataset from {resolved_source}')
        dataset_root = resolved_source

    loaded = defaultdict(dict)
    for subset in wanted_subsets:
        subset_paths = [os.path.join(dataset_root, name) for name in file_structure[subset]]
        loaded[subset][self.eval_split] = load_file_with_extension(subset_paths)

    return loaded
|
|
213
|
+
|
|
159
214
|
def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
|
|
160
215
|
"""
|
|
161
216
|
Reformat the dataset subset with subset_key and format.
|
|
@@ -249,7 +304,7 @@ class DataAdapter(ABC):
|
|
|
249
304
|
def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
|
|
250
305
|
**kwargs) -> Dict[str, List[float]]:
|
|
251
306
|
"""
|
|
252
|
-
compute weighted mean of
|
|
307
|
+
compute weighted mean of score of all samples
|
|
253
308
|
|
|
254
309
|
Args:
|
|
255
310
|
review_res_list: [score1, score2, ...]
|
|
@@ -258,7 +313,7 @@ class DataAdapter(ABC):
|
|
|
258
313
|
avg_res: Dict[str, List[float]]
|
|
259
314
|
|
|
260
315
|
"""
|
|
261
|
-
if isinstance(review_res_list[0], list):
|
|
316
|
+
if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
|
|
262
317
|
review_res_list = [item for sublist in review_res_list for item in sublist]
|
|
263
318
|
|
|
264
319
|
items = defaultdict(list)
|
|
@@ -270,7 +325,7 @@ class DataAdapter(ABC):
|
|
|
270
325
|
items['AverageAccuracy'].append(scores)
|
|
271
326
|
return items
|
|
272
327
|
|
|
273
|
-
def gen_report(self, subset_score_map: dict,
|
|
328
|
+
def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
|
|
274
329
|
"""
|
|
275
330
|
Generate report for the evaluation results for all subsets.
|
|
276
331
|
|
|
@@ -278,7 +333,7 @@ class DataAdapter(ABC):
|
|
|
278
333
|
subset_score_map: The subset-score map.
|
|
279
334
|
e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
|
|
280
335
|
|
|
281
|
-
|
|
336
|
+
model_name: The evaluation model name.
|
|
282
337
|
|
|
283
338
|
Returns: The evaluation report.
|
|
284
339
|
|
|
@@ -312,9 +367,17 @@ class DataAdapter(ABC):
|
|
|
312
367
|
"model_name": "qwen2.5"
|
|
313
368
|
}
|
|
314
369
|
""" # noqa: E501
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
370
|
+
return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
|
|
371
|
+
|
|
372
|
+
def post_process_report(self, report: Report, **kwargs):
    """
    Hook for post-processing a generated report (draw a chart, save to file, etc.).

    The base implementation does nothing; override it to customize the
    report format or content.

    Args:
        report (Report): The generated report.
    """
    return None
|
|
318
381
|
|
|
319
382
|
def gen_prompt_data(self,
|
|
320
383
|
prompt: str,
|
|
@@ -322,14 +385,33 @@ class DataAdapter(ABC):
|
|
|
322
385
|
choices: Optional[List[str]] = None,
|
|
323
386
|
index: Optional[Union[int, str]] = None,
|
|
324
387
|
id: Optional[Union[int, str]] = None,
|
|
388
|
+
messages: Optional[List[dict]] = None,
|
|
325
389
|
**kwargs) -> dict:
|
|
390
|
+
"""
|
|
391
|
+
Generates a dictionary representation of prompt data for evaluation or inference.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
prompt (str): The main prompt or input text. Can also be a list of prompts.
|
|
395
|
+
system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
|
|
396
|
+
choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
|
|
397
|
+
If not provided, uses self.choices. Defaults to None.
|
|
398
|
+
index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
|
|
399
|
+
Defaults to 0 if not provided. Defaults to None.
|
|
400
|
+
id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
|
|
401
|
+
messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
|
|
402
|
+
If messages is provided, it will be used as the prompt data instead of the prompt string.
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
dict: A dictionary representation of the prompt data, suitable for further processing or model input.
|
|
406
|
+
""" # noqa: E501
|
|
326
407
|
data = [prompt] if not isinstance(prompt, list) else prompt
|
|
327
408
|
prompt_data = PromptData(
|
|
328
409
|
data=data,
|
|
329
410
|
multi_choices=choices or self.choices,
|
|
330
411
|
system_prompt=system_prompt or self.system_prompt,
|
|
331
412
|
index=index or 0,
|
|
332
|
-
id=id
|
|
413
|
+
id=id,
|
|
414
|
+
messages=messages)
|
|
333
415
|
return prompt_data.to_dict()
|
|
334
416
|
|
|
335
417
|
def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
|
|
@@ -414,7 +496,8 @@ class DataAdapter(ABC):
|
|
|
414
496
|
|
|
415
497
|
# Extract question from raw_input if available
|
|
416
498
|
raw_input = kwargs.get('raw_input', {})
|
|
417
|
-
question_keys = ['question', 'prompt', 'query', 'problem']
|
|
499
|
+
question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
|
|
500
|
+
# Find the first non-empty question key in raw_input
|
|
418
501
|
question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
|
|
419
502
|
|
|
420
503
|
# Request judge and obtain score
|
|
File without changes
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
2
|
+
from evalscope.constants import EvalType
|
|
3
|
+
from evalscope.metrics import LLMJudge
|
|
4
|
+
|
|
5
|
+
TEMPLATE_0SHOT = """Please read the following text and answer the question below.
|
|
6
|
+
|
|
7
|
+
<text>
|
|
8
|
+
{context}
|
|
9
|
+
</text>
|
|
10
|
+
|
|
11
|
+
{question}
|
|
12
|
+
|
|
13
|
+
Format your response as follows: "Therefore, the answer is (insert answer here)"."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@Benchmark.register(
    name='docmath',
    pretty_name='DocMath',
    description=
    'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
    dataset_id='yale-nlp/DocMath-Eval',
    metric_list=['AverageAccuracy'],
    subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template=TEMPLATE_0SHOT,
)
class DocMathAdapter(DataAdapter):
    """Data adapter for the DocMath-Eval benchmark (zero-shot, numeric answers)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load(self, **kwargs):
        """Load the dataset via the base loader, forcing ``split_as_subset=True``."""
        # default load mini test
        kwargs['split_as_subset'] = True
        return super().load(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.

        The sample's ``paragraphs`` are joined with newlines to form the context,
        which is inserted together with ``question`` into the prompt template.
        """
        context = '\n'.join(input_d['paragraphs'])
        question = input_d['question']
        prompt = self.prompt_template.format(context=context, question=question)
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold): returns the sample's ``ground_truth`` field.
        """
        return input_d['ground_truth']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        from .utils import extract_answer

        return extract_answer(result)

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer (rule-based scoring).
        """
        from .utils import get_acc

        return get_acc(prediction=pred, gt=gold)

    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
        """
        Score the prediction with an LLM judge.

        Args:
            gold: reference answer.
            pred: predicted answer.
            judge: callable judge invoked with ``prompt`` and ``system_prompt``.
            **kwargs: may carry ``raw_input`` (the raw sample dict) from which
                the question text is read.

        Returns:
            1.0 when the judge response contains 'YES', otherwise 0.0.
        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

        # raw_input may be missing or None; fall back to an empty question
        # instead of failing with a TypeError on None['question'].
        raw_input = kwargs.get('raw_input') or {}
        question = raw_input.get('question', '')
        # get grading response
        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
        # parse grading response (an empty/None response counts as a miss)
        return 1.0 if 'YES' in (orm_response or '') else 0.0
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import numpy as np
|
|
3
|
+
import re
|
|
4
|
+
from sympy import Rational
|
|
5
|
+
|
|
6
|
+
from evalscope.utils.logger import get_logger
|
|
7
|
+
|
|
8
|
+
logger = get_logger()
|
|
9
|
+
|
|
10
|
+
GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
|
|
11
|
+
Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
|
|
12
|
+
Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
|
|
13
|
+
Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
|
|
14
|
+
|
|
15
|
+
Your output must follow the following format:
|
|
16
|
+
1) Provide an explanation for why the answers are equivalent or not.
|
|
17
|
+
2) Then provide your final answer in the form of: [[YES]] or [[NO]]
|
|
18
|
+
""" # noqa: E501
|
|
19
|
+
|
|
20
|
+
ORM_USER_TEMPLATE = """
|
|
21
|
+
Problem: {problem}
|
|
22
|
+
Answer 1: {answer_1}
|
|
23
|
+
Answer 2: {answer_2}
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def round_up_to_decimal(number, decimals):
    """Round ``number`` up (towards positive infinity) at ``decimals`` decimal places."""
    scale = 10**decimals
    scaled_ceiling = math.ceil(number * scale)
    return scaled_ceiling / scale
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def is_number(string):
    """Return True when ``string`` is a plain decimal number.

    Accepts an optional sign, an integer part written either with
    thousands separators (``1,234``) or as bare digits, and an optional
    fractional part.
    """
    return re.match(r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$', string) is not None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def is_scientific_number(string):
    """Return True when ``string`` is in lowercase-``e`` scientific notation.

    The exponent may carry a minus sign only (``1.5e-3``); an uppercase
    ``E`` or an explicit ``+`` exponent sign is not accepted.
    """
    return re.match(r'^[-+]?\d+(\.\d+)?e[-]?\d+$', string) is not None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def normalize(prediction: str):
    """Normalize a free-text model answer into a Python value.

    The text is cleaned in stages (currency/unit words, special symbols,
    boolean keywords, approximation wording, surrounding units, number
    notation, option letters) and then evaluated as a Python expression.

    Args:
        prediction: Raw answer text. Non-string values are stringified and
            ``None`` becomes ``'0'``.

    Returns:
        The evaluated value (number, bool, list, option string, ...), with
        sets/tuples converted to lists, complex values reduced to their real
        part and sympy ``Rational`` values converted to float. Falls back to
        ``0`` when the cleaned text cannot be evaluated.
    """
    # Coerce before any string method is called; previously the type check came
    # after ``.strip()`` and therefore could never protect against non-str input.
    if not isinstance(prediction, str):
        prediction = str(prediction) if prediction is not None else '0'

    # [Stage 1] Preprocess the string
    prediction = prediction.strip().rstrip('.')

    # 'USD' must be removed before 'US', otherwise 'USD' degrades to a stray 'D'.
    for money in ['£', '€', '¥', 'million', 'billion', 'thousand', 'USD', 'US', 'RMB']:
        prediction = prediction.replace(money, '')

    # Replace special tokens: keep only what follows the last '=' / '≈'
    if '=' in prediction:
        prediction = prediction.split('=')[-1].strip()
    if '≈' in prediction:
        prediction = prediction.split('≈')[-1].strip()
    for token in ('`', '%', '$', '°'):
        prediction = prediction.replace(token, '')

    # Detect the boolean keyword in the generation
    if prediction in ('true', 'yes'):
        prediction = 'True'
    elif prediction in ('false', 'no'):
        prediction = 'False'
    if 'True' in prediction or 'False' in prediction:
        prediction = 'True' if 'True' in prediction else 'False'

    # Detect the approximation keyword
    if 'approximately' in prediction:
        prediction = prediction.replace('approximately', '').strip()
    if ' or ' in prediction:
        prediction = prediction.split(' or ')[0]

    # Drop the units before and after the number. The patterns are applied in
    # order, each one on the result of the previous one.
    unit_patterns = (
        r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$',
        r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$',
        r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$',
        r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$',
    )
    for pattern in unit_patterns:
        unit_match = re.match(pattern, prediction)
        if unit_match:
            prediction = unit_match.group(1)

    # [Stage 2] Preprocess the number notation
    if '10^' in prediction:
        prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
    if ' x ' in prediction:
        prediction = prediction.replace(' x ', '*')
    if ' × ' in prediction:
        prediction = prediction.replace(' × ', '*')
    if is_number(prediction):
        prediction = prediction.replace(',', '')

    # [Stage 3] Preprocess the option: keep the first '(a)'..'(d)' as a quoted string
    option_match = re.search(r'\([a-d]\)', prediction)
    if option_match:
        prediction = '"' + option_match.group(0) + '"'

    # If the prediction is empty, use dummy '0'
    if not prediction:
        prediction = '0'

    # Converting the string answer to a number/list/bool/option.
    # SECURITY NOTE(review): ``eval`` runs text derived from model output and can
    # execute arbitrary expressions. It is kept because the 'math.pow(...)'
    # rewrite above relies on it; consider a restricted evaluator.
    try:
        prediction = eval(prediction)
    except Exception:
        # TO CHECK
        prediction = 0

    # Performing common type conversion
    if isinstance(prediction, (set, tuple)):
        prediction = list(prediction)
        # Guard against an empty container before inspecting the first element.
        if prediction:
            if isinstance(prediction[0], complex):
                prediction = [tmp.real for tmp in prediction]
            elif isinstance(prediction[0], Rational):
                prediction = [float(tmp) for tmp in prediction]
    elif isinstance(prediction, np.ndarray):
        prediction = prediction.tolist()
    else:
        if isinstance(prediction, complex):
            prediction = prediction.real
        elif isinstance(prediction, Rational):
            prediction = float(prediction)

    return prediction
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def extract_answer(response: str):
    """Pull the final answer out of the model's response text.

    Asterisks and parentheses are stripped first, then the first
    case-insensitive occurrence of 'the answer is <number>' is looked up.

    Args:
        response: Text extracted from the model's response

    Returns:
        The matched answer string with commas and trailing periods removed
        (an optional leading symbol such as '$' or '%' is kept), or the
        cleaned response itself when the pattern is not found.
    """
    # Remove any asterisks or parentheses in a single pass
    cleaned = response.translate(str.maketrans('', '', '*()'))

    # Search for the pattern 'the answer is {final answer}.'
    found = re.search(r'the answer is (\=?\≈?\`?\%?\$?\°?\£?\€?\¥?-?[0-9\.,]+)', cleaned, re.IGNORECASE)
    if found is None:
        return cleaned

    # Remove commas from the matched number (if any)
    return found.group(1).replace(',', '').rstrip('.')
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def within_eps(pred: float, gt: float):
    """Return True when ``pred`` lies within 0.15% of ``gt`` (inclusive bounds)."""
    tolerance = abs(gt) * 0.0015
    lower_bound = gt - tolerance
    upper_bound = gt + tolerance
    return bool(lower_bound <= pred <= upper_bound)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def compare_two_numbers(p, gt):
    """Leniently compare a predicted number ``p`` against the ground truth ``gt``.

    Args:
        p: Predicted value. Only real ``int``/``float`` values are compared;
            bool, str, list, tuple, complex and dict never match.
        gt: Numeric ground truth.

    Returns:
        True when the two magnitudes differ by an exact power of ten, when one
        is (within tolerance) 100x / 1000x / 100000x the other, when both
        round up to the same value at two decimals, or when ``p`` is within
        the relative tolerance of ``gt``; otherwise False.

    Raises:
        ValueError: if ``p`` is of an unexpected type.
    """
    # bool is a subclass of int, so it has to be rejected BEFORE the numeric
    # check; otherwise True/False would be compared as 1/0 and e.g. True would
    # match any power-of-ten ground truth.
    if isinstance(p, (bool, list, str, tuple, complex, dict)):
        return False
    if not isinstance(p, (int, float)):
        raise ValueError(p)

    v1, v2 = max(abs(gt), abs(p)), min(abs(gt), abs(p))

    # Magnitudes differing by an exact power of ten are accepted as equal.
    if v1 != 0 and v2 != 0:
        magnitude_gap = math.log10(v1 / v2)
        if int(magnitude_gap) == magnitude_gap:
            return True

    # Approximate scale mismatches (x100, x1000, x100000) within tolerance.
    if v2 <= v1 / 50 and within_eps(pred=v2 * 100, gt=v1):
        return True
    elif v2 <= v1 / 500 and within_eps(pred=v2 * 1000, gt=v1):
        return True
    elif v2 <= v1 / 50000 and within_eps(pred=v2 * 100000, gt=v1):
        return True

    if round_up_to_decimal(v1, 2) == round_up_to_decimal(v2, 2):
        return True

    return within_eps(pred=p, gt=gt)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_acc(prediction, gt, cot=True):
    """Score a prediction against the ground truth as 0 or 1.

    Args:
        prediction: Model answer. With ``cot=True`` it is passed through
            ``normalize``; otherwise it is converted with ``float``.
        gt: Ground truth; its type name must be one of int, float, float64
            or bool, anything else scores 0.
        cot: Whether the prediction is free text that needs normalization.

    Returns:
        1 when the prediction matches the ground truth, otherwise 0. Any
        exception raised while scoring also yields 0.
    """
    try:
        prediction = normalize(prediction) if cot else float(prediction)

        answer_type = type(gt).__name__
        if answer_type not in ('int', 'float', 'float64', 'bool'):
            # Unsupported ground-truth type: scored as a miss. Checked
            # explicitly instead of with ``assert`` so it also holds under -O.
            return 0

        if not isinstance(prediction, (str, int, float, bool, list)):
            # Placeholders are required here: passing extra args without them
            # makes the logging call fail to format and the message is lost.
            logger.error('Error: %s %s', prediction, type(prediction))
            return 0

        # Comparing prediction against the reference
        if answer_type == 'bool':
            return int(prediction == gt)
        return int(compare_two_numbers(prediction, gt))
    except Exception:
        # Deliberate best-effort: an unscorable prediction counts as wrong.
        return 0
|
|
File without changes
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from evalscope.benchmarks import Benchmark, DataAdapter
|
|
5
|
+
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.utils.logger import get_logger
|
|
7
|
+
|
|
8
|
+
logger = get_logger()
|
|
9
|
+
|
|
10
|
+
DROP_EXAMPLES = '''Some examples of passages and Q&A are provided below.
|
|
11
|
+
|
|
12
|
+
# Examples
|
|
13
|
+
---
|
|
14
|
+
Passage: Trunajaya rebellion or Trunajaya War was the ultimately unsuccessful rebellion waged by the Madurese prince Trunajaya and fighters from Makassar against the Mataram Sultanate and its Dutch East India Company supporters in Java during the 1670s. The rebellion was initially successful: the rebels defeated the royal army at Gegodog , captured most of the Javanese north coast, and took the Mataram capital Plered . King Amangkurat I died during the retreat of the royal court. His son and successor, Amangkurat II, requested help from the VOC in exchange for financial remuneration and geopolitical concessions. The VOC\'s subsequent involvement turned the tide of the war. VOC and Mataram forces recovered lost territories and overran Trunajaya\'s new capital at Kediri . However, the rebellion continued until the capture of Trunajaya at the end of 1679, and the defeat, death, or surrender of the other rebel leaders . Trunajaya was killed by Amangkurat II personally in 1680 while a prisoner of the VOC. After his father\'s death in 1677, Amangkurat II also faced rival claims to the throne. The most serious rival was his brother Pangeran Puger, who took the capital Plered in 1677 and did not surrender until 1681.
|
|
15
|
+
Question: How many years was it between Trunajaya\'s capture and his death while prisoner of the VOC?
|
|
16
|
+
Answer: 1
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
Passage: Led by former Giant Kurt Warner, the defending NFC champions took the field at Giants Stadium against a Giants team still reeling from their bad loss in New Orleans. The Giants scored first, sending Jacobs in for a 4-yard touchdown run following a Terrell Thomas interception. Later, Arizona running back Beanie Wells scored his first career touchdown on a 13-yard rush. Manning responded by throwing a 62-yard touchdown to Nicks for his longest reception of the year. In the second half, the Cardinals\' Tim Hightower and Jason Wright scored touchdowns. But it was turnovers that decided this game; Manning\'s 3 interceptions were as many as he had thrown all season. The Giants scored only 3 points in the second half, ending the game on an interception to Antrel Rolle. The Giants notable streak of 38 consecutive starts by the same offensive line unit was ended here, as offensive tackle Kareem McKenzie missed the game with a groin injury. McKenzie returned the following week.
|
|
20
|
+
Question: Which player made the first score of the game?
|
|
21
|
+
Answer: Jacobs
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
Passage: Hoping to rebound from their road loss to the Bills, the Chargers flew to Wembley Stadium for the 2008 International Series game with the New Orleans Saints. In the first quarter, San Diego trailed early as kicker Taylor Mehlhaff got a 23-yard field goal. The \'Bolts would respond with kicker Nate Kaeding getting a 33-yard field goal. In the second quarter, New Orleans regained the lead as QB Drew Brees (a former Charger) completed a 12-yard TD pass to WR Devery Henderson (with a failed PAT) and RB Deuce McAllister getting a 1-yard TD run. San Diego answered as QB Philip Rivers completed a 12-yard TD pass to RB LaDainian Tomlinson, but the Saints replied with Brees completing a 30-yard TD pass to WR Lance Moore. The Chargers closed out the half with Rivers completing a 12-yard TD pass to TE Antonio Gates. In the third quarter, New Orleans increased its lead Brees completing a 1-yard TD pass to TE Mark Campbell, after a very controversial Pass interference call on cornerback Cletis Gordon put the Saints on the 1-yard line. The \'Bolts would answer with Kaeding getting a 24-yard field goal. In the fourth quarter, the Saints continued to build its lead as FB Mike Karney got a 1-yard TD run. San Diego tried to rally as Kaeding nailed a 31-yard field goal, Rivers completed a 14-yard TD pass to WR Vincent Jackson, and Brees giving the \'Bolts a safety via an incomplete pass thrown into the back of his own endzone. However, New Orleans\' defense stiffened for the win. With the loss, the Chargers went into their bye week at 3-5.
|
|
25
|
+
Question: How many total yards of touchdown passes did Drew Brees make?
|
|
26
|
+
Answer: 43
|
|
27
|
+
|
|
28
|
+
''' # noqa: E501
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@Benchmark.register(
    name='drop',
    pretty_name='DROP',
    dataset_id='AI-ModelScope/DROP',
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='validation',
    prompt_template=
    'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
)
class DROPAdapter(DataAdapter):
    """Data adapter for the DROP reading-comprehension benchmark.

    Builds a passage/question prompt (optionally prefixed with the fixed
    ``DROP_EXAMPLES`` text), extracts the text following ``Answer:`` from the
    model output and scores it by exact match against the gold answers.
    """

    def __init__(self, **kwargs):
        """Initialise the adapter and normalise ``few_shot_num`` to 0 or 3.

        Args:
            **kwargs: Forwarded unchanged to ``DataAdapter.__init__``. The
                ``few_shot_num`` entry (default 0) is additionally inspected
                here.
        """
        super().__init__(**kwargs)

        # Few-shot examples come from the static DROP_EXAMPLES text rather
        # than from the dataset, so any non-zero request is coerced to 3.
        # NOTE(review): presumably DROP_EXAMPLES holds three examples - confirm.
        if kwargs.get('few_shot_num', 0) != 0:
            self.few_shot_num = 3
            logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
        else:
            self.few_shot_num = 0

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """Generate the model prompt for one sample.

        Args:
            input_d: Raw sample; its ``passage`` and ``question`` fields are read.
            subset_name: Unused here; kept for interface compatibility.
            few_shot_list: Unused here; examples come from ``DROP_EXAMPLES``.

        Returns:
            Whatever ``self.gen_prompt_data`` returns for the formatted prompt.
        """
        drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
        query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
        prompt = self.prompt_template.format(
            drop_examples=drop_examples,
            query=query,
        )
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> List[tuple]:
        """Collect the distinct gold answers of one sample.

        The primary ``answer`` is combined with every entry of
        ``validated_answers``, which is stored column-wise
        (``{"number": ['1', '8'], ...}``) and is turned into one answer dict
        per index (``[{"number": '1', ...}, {"number": '8', ...}]``).

        Args:
            input_d: Raw sample with an ``answer`` dict and, optionally, a
                ``validated_answers`` dict of parallel lists.

        Returns:
            The parsed answers (tuples of strings, see ``parse_answer``) in
            first-seen order with duplicates removed.
        """
        # A missing or None 'validated_answers' simply contributes nothing.
        validated = input_d.get('validated_answers') or {}
        extra_answers = [{
            'number': number,
            'date': date,
            'spans': spans,
        } for number, date, spans in zip(
            validated.get('number', []),
            validated.get('date', []),
            validated.get('spans', []),
        )]
        candidates = [input_d['answer']] + extra_answers

        # dict.fromkeys keeps insertion order, giving an order-preserving dedup.
        return list(dict.fromkeys(DROPAdapter.parse_answer(candidate) for candidate in candidates))

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """Extract the answer text from the raw model output.

        Returns:
            The rest of the line after the first case-insensitive ``Answer:``
            marker, or the whole ``result`` when no marker is present.
        """
        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
        extracted_answer = match.group(1) if match else result
        return extracted_answer

    def match(self, gold: List[tuple], pred: str) -> float:
        """Score a prediction by exact match against the gold answers.

        Args:
            gold: Gold answers as returned by ``get_gold_answer``.
            pred: Answer text as returned by ``parse_pred_result``.

        Returns:
            1.0 if the prediction matches any non-blank gold answer, else 0.0.
        """
        from .utils import _answer_to_bags

        # The prediction is the same for every gold answer: convert it once.
        # Only the first element of the helper's result is compared.
        predicted_spans = _answer_to_bags(pred)[0]

        for gold_answer in gold:
            # Gold answers whose first entry is blank never count as a match.
            if not gold_answer or not gold_answer[0].strip():
                continue
            gold_spans = _answer_to_bags(gold_answer)[0]
            if len(predicted_spans) == len(gold_spans) and set(predicted_spans) == set(gold_spans):
                # Exact match is the maximum score, so stop at the first hit.
                return 1.0

        return 0.0

    @staticmethod
    def parse_answer(answer):
        """Convert one raw answer dict into a hashable tuple of strings.

        Precedence: a non-empty ``number`` wins, then non-empty ``spans``,
        otherwise the ``date`` parts are joined as "day month year".
        """
        # NOTE: Everything is returned as a tuple for uniformity and hashability.
        if answer['number'] != '':
            return (str(answer['number']), )
        if answer['spans'] != []:
            return tuple(answer['spans'])
        date = answer['date']
        return (' '.join([date['day'], date['month'], date['year']]).strip(), )
|