evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/docmath/docmath_adapter.py
@@ -0,0 +1,85 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import LLMJudge
+
+TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+{question}
+
+Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+@Benchmark.register(
+    name='docmath',
+    pretty_name='DocMath',
+    tags=['Reasoning', 'Mathematics', 'Long Context'],
+    description=
+    'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+    dataset_id='yale-nlp/DocMath-Eval',
+    metric_list=['AverageAccuracy'],
+    subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=TEMPLATE_0SHOT,
+)
+class DocMathAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs):
+        # default load mini test
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        context = '\n'.join(input_d['paragraphs'])
+        question = input_d['question']
+        prompt = self.prompt_template.format(context=context, question=question)
+        return self.gen_prompt_data(prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['ground_truth']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        from .utils import extract_answer
+
+        extracted_answer = extract_answer(result)
+        return extracted_answer
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import get_acc
+
+        return get_acc(prediction=pred, gt=gold)
+
+    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+        # parse grading response
+        if 'YES' in orm_response:
+            return 1.0
+        else:
+            return 0.0
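For context, the adapter above registers the benchmark under the name 'docmath', so after upgrading to 0.16.2 it can be selected like any other dataset. The following is a minimal sketch based on evalscope's documented `TaskConfig`/`run_task` entry points; the model id and `limit` value are placeholders, not values taken from this diff.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['docmath'],              # name registered by @Benchmark.register above
    limit=10,                          # evaluate a small sample first; drop to run the full set
)
run_task(task_cfg=task_cfg)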
evalscope/benchmarks/docmath/utils.py
@@ -0,0 +1,220 @@
+import math
+import numpy as np
+import re
+from sympy import Rational
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+Your output must follow the following format:
+1) Provide an explanation for why the answers are equivalent or not.
+2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+"""  # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Problem: {problem}
+Answer 1: {answer_1}
+Answer 2: {answer_2}
+"""
+
+
+def round_up_to_decimal(number, decimals):
+    factor = 10**decimals
+    return math.ceil(number * factor) / factor
+
+
+def is_number(string):
+    pattern = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
+    match = re.match(pattern, string)
+    return bool(match)
+
+
+def is_scientific_number(string):
+    pattern = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
+    match = re.match(pattern, string)
+    return bool(match)
+
+
+def normalize(prediction: str):
+    # Preprocessing the string [Stage 1]
+    prediction = prediction.strip()
+    prediction = prediction.rstrip('.')
+    if not isinstance(prediction, str):
+        prediction = str(prediction) if prediction is not None else '0'
+
+    for money in ['£', '€', '¥', 'million', 'billion', 'thousand', 'US', 'USD', 'RMB']:
+        prediction = prediction.replace(money, '')
+
+    # Replace special tokens
+    if '=' in prediction:
+        prediction = prediction.split('=')[-1].strip()
+    if '≈' in prediction:
+        prediction = prediction.split('≈')[-1].strip()
+    if '`' in prediction:
+        prediction = prediction.replace('`', '')
+    if '%' in prediction:
+        prediction = prediction.replace('%', '')
+    if '$' in prediction:
+        prediction = prediction.replace('$', '')
+    if '°' in prediction:
+        prediction = prediction.replace('°', '')
+
+    # Detect the boolean keyword in the generation
+    if prediction in ['true', 'yes', 'false', 'no']:
+        if prediction == 'true' or prediction == 'yes':
+            prediction = 'True'
+        else:
+            prediction = 'False'
+    if 'True' in prediction or 'False' in prediction:
+        prediction = 'True' if 'True' in prediction else 'False'
+
+    # Detect the approximation keyword
+    if 'approximately' in prediction:
+        prediction = prediction.replace('approximately', '').strip()
+    if ' or ' in prediction:
+        prediction = prediction.split(' or ')[0]
+
+    # Drop the units before and after the number
+    if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
+        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction).group(1)
+    if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
+        prediction = re.search(r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction).group(1)
+    if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
+        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction).group(1)
+    if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
+        prediction = re.search(r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction).group(1)
+
+    # Preprocessing the number [Stage 1]
+    if '10^' in prediction:
+        prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
+    if ' x ' in prediction:
+        prediction = prediction.replace(' x ', '*')
+    if ' × ' in prediction:
+        prediction = prediction.replace(' × ', '*')
+    if is_number(prediction):
+        prediction = prediction.replace(',', '')
+
+    # Preprocessing the option [Stage 3]
+    if '(a)' in prediction or '(b)' in prediction or '(c)' in prediction or '(d)' in prediction:
+        prediction = '"' + re.search(r'\([a-d]\)', prediction).group(0) + '"'
+
+    # If the prediction is empty, use dummy '0'
+    if not prediction:
+        prediction = '0'
+
+    # Converting the string answer to a number/list/bool/option
+    try:
+        prediction = eval(prediction)
+    except Exception:
+        # TO CHECK
+        prediction = 0
+
+    # Performing common type conversion
+    if isinstance(prediction, (set, tuple)):
+        prediction = list(prediction)
+        if isinstance(prediction[0], complex):
+            prediction = [tmp.real for tmp in prediction]
+        elif isinstance(prediction[0], Rational):
+            prediction = [float(tmp) for tmp in prediction]
+    elif isinstance(prediction, np.ndarray):
+        prediction = prediction.tolist()
+    else:
+        if isinstance(prediction, complex):
+            prediction = prediction.real
+        elif isinstance(prediction, Rational):
+            prediction = float(prediction)
+
+    return prediction
+
+
+def extract_answer(response: str):
+    """Parses the final answer from the model's response text.
+
+    Args:
+        response: Text extracted from the model's response
+
+    Returns:
+        The final answer as a numeric value (string), or None if not found
+    """
+    # Remove any asterisks or other unwanted characters
+    response = response.replace('*', '')
+    response = response.replace('(', '')
+    response = response.replace(')', '')
+
+    # Search for the pattern 'the answer is {final answer}.'
+    match = re.search(r'the answer is (\=?\≈?\`?\%?\$?\°?\£?\€?\¥?-?[0-9\.,]+)', response, re.IGNORECASE)
+
+    if match:
+        # Remove commas from the matched number (if any)
+        res = match.group(1).replace(',', '').rstrip('.')
+        return res
+    else:
+        return response
+
+
+def within_eps(pred: float, gt: float):
+    eps = abs(gt) * 0.0015
+    if pred >= gt - eps and pred <= gt + eps:
+        return True
+    else:
+        return False
+
+
+def compare_two_numbers(p, gt):
+    if isinstance(p, int) or isinstance(p, float):
+        pass
+    elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
+        return False
+    elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
+        return False
+    else:
+        raise ValueError(p)
+
+    v1, v2 = max(abs(gt), abs(p)), min(abs(gt), abs(p))
+    if (v1 != 0 and v2 != 0) and int(math.log10(v1 / v2)) == math.log10(v1 / v2):
+        return True
+
+    if v2 <= v1 / 50 and within_eps(pred=v2 * 100, gt=v1):
+        return True
+    elif v2 <= v1 / 500 and within_eps(pred=v2 * 1000, gt=v1):
+        return True
+    elif v2 <= v1 / 50000 and within_eps(pred=v2 * 100000, gt=v1):
+        return True
+
+    if round_up_to_decimal(v1, 2) == round_up_to_decimal(v2, 2):
+        return True
+
+    return within_eps(pred=p, gt=gt)
+
+
+def get_acc(prediction, gt, cot=True):
+    try:
+        if cot:
+            prediction = normalize(prediction)
+        else:
+            prediction = float(prediction)
+
+        answer_type = type(gt).__name__
+        assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
+        if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
+            # Comparing prediction against the reference
+            if answer_type in ['bool']:
+                acc = int(prediction == gt)
+            elif answer_type == 'int':
+                acc = int(compare_two_numbers(prediction, gt))
+            elif answer_type == 'float' or answer_type == 'float64':
+                acc = int(compare_two_numbers(prediction, gt))
+            else:
+                acc = 0
+        else:
+            acc = 0
+            logger.error('Error: ', prediction, type(prediction))
+        return acc
+    except Exception:
+        return 0
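The helpers above drive DocMath scoring: `extract_answer` pulls the number that follows the answer cue out of a response, `normalize` coerces it to a numeric value, and `compare_two_numbers`/`within_eps` allow a 0.15% relative tolerance. A small worked example using only the functions defined in this file (the import path assumes the package layout shown in the file list above):

from evalscope.benchmarks.docmath.utils import extract_answer, get_acc

response = 'Revenue grew by 12.5% year over year. Therefore, the answer is 12.5%.'
pred = extract_answer(response)            # -> '12.5' (commas stripped, trailing '.' removed)
print(get_acc(prediction=pred, gt=12.5))   # -> 1 (within the 0.15% tolerance)
print(get_acc(prediction=pred, gt=13.0))   # -> 0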
evalscope/benchmarks/drop/drop_adapter.py
@@ -31,6 +31,9 @@ Answer: 43
 @Benchmark.register(
     name='drop',
     pretty_name='DROP',
+    tags=['Reasoning'],
+    description=
+    'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
     dataset_id='AI-ModelScope/DROP',
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
evalscope/benchmarks/frames/__init__.py: file without changes
evalscope/benchmarks/frames/frames_adapter.py
@@ -0,0 +1,91 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import LLMJudge, exact_match
+
+TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+{question}
+
+Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+@Benchmark.register(
+    name='frames',
+    pretty_name='FRAMES',
+    tags=['Reasoning', 'Long Context'],
+    description=
+    'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+    dataset_id='iic/frames',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.GENERATION],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=TEMPLATE_0SHOT,
+)
+class FramesAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs):
+        # default load with snapshot
+        kwargs['file_structure'] = {'default': ['test.jsonl']}
+        data_dict = super().load_with_snapshot(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
+        question = input_d['Prompt']
+        prompt = self.prompt_template.format(context=context, question=question)
+        return self.gen_prompt_data(prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['Answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        response = result.replace('*', '')
+
+        if 'the answer is' in response:
+            ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
+        else:
+            ans = ''
+
+        return ans
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import normalize_answer
+        gold = normalize_answer(gold)
+        pred = normalize_answer(pred)
+        return exact_match(gold=gold, pred=pred)
+
+    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['Prompt']
+        # get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+        # parse grading response
+        if 'YES' in orm_response:
+            return 1.0
+        else:
+            return 0.0
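Answer extraction in `parse_pred_result` keys off the literal cue from the prompt template: everything after the last occurrence of 'the answer is' is kept and trailing punctuation is trimmed. The snippet below replays that logic on a made-up response to show the effect; it is illustrative only, not additional package code.

result = 'Combining both articles, **the answer is Paris.**'
response = result.replace('*', '')
if 'the answer is' in response:
    ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
else:
    ans = ''
print(ans)   # -> 'Paris'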
evalscope/benchmarks/frames/utils.py
@@ -0,0 +1,37 @@
+import re
+import string
+
+
+def normalize_answer(s):
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+Your output must follow the following format:
+1) Provide an explanation for why the answers are equivalent or not.
+2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+"""  # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Problem: {problem}
+Answer 1: {answer_1}
+Answer 2: {answer_2}
+"""
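`normalize_answer` lower-cases the string, strips punctuation and English articles, and collapses whitespace before `FramesAdapter.match` does an exact comparison, so superficially different renderings of the same answer compare equal. For example (import path assumes the layout shown in the file list above):

from evalscope.benchmarks.frames.utils import normalize_answer

print(normalize_answer('The Eiffel Tower!'))                                        # -> 'eiffel tower'
print(normalize_answer('Eiffel  Tower') == normalize_answer('the Eiffel Tower.'))   # -> True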
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
 import os
+from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -15,7 +16,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
-    pretty_name='General
+    pretty_name='General-MCQ',
+    description='A general multiple-choice question answering dataset.',
+    tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -24,7 +27,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
-    prompt_template='
+    prompt_template='请回答问题，并选出其中的正确答案。你的回答的最后一行应该是这样的格式：“答案是：LETTER”（不带引号），其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
     query_template='问题：{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
@@ -34,28 +37,21 @@ class GeneralMCQAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
-                [removed lines not rendered in the source diff view]
-                if subset_name in data_dict:
-                    data_dict[subset_name].update({split_name: rows})
-                else:
-                    data_dict[subset_name] = {split_name: rows}
-
-        return data_dict
+                # Check for files with different extensions
+                for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+                    if os.path.exists(dataset_name_or_path):
+                        file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+                    else:
+                        file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+
+                    if os.path.exists(file_path):
+                        data_dict[subset_name][split_name] = loader(file_path)
+                        break  # Stop checking other extensions once a file is found
+
+        return dict(data_dict)
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
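The rewritten `load_from_disk` resolves each split from a file named `{subset_name}_{split_name}` with a `.jsonl` or `.csv` extension, looked up either directly under `dataset_name_or_path` or under `work_dir/dataset_name_or_path`, with JSONL taking priority because it is checked first. A hypothetical layout for a custom MCQ dataset with a 'default' subset would therefore look like this (names are illustrative, not mandated by the diff):

# Hypothetical on-disk layout for a general_mcq style dataset:
#   my_general_mcq/default_dev.csv    # train_split='dev', supplies few-shot examples
#   my_general_mcq/default_val.csv    # eval_split='val', supplies the evaluated items
# If default_dev.jsonl and default_dev.csv both exist, the .jsonl file is loaded,
# since the loop checks ('.jsonl', jsonl_to_list) first and breaks on the first hit.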
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -13,6 +13,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_qa',
+    pretty_name='General-QA',
+    description='General Question Answering dataset',
+    tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
     metric_list=['AverageBLEU', 'AverageRouge'],
evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -10,6 +10,9 @@ from evalscope.metrics import exact_match
 @Benchmark.register(
     name='gpqa',
     pretty_name='GPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
     dataset_id='modelscope/gpqa',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -15,6 +15,9 @@ logger = get_logger()
 @Benchmark.register(
     name='gsm8k',
     pretty_name='GSM8K',
+    tags=['Mathematics'],
+    description=
+    'GSM8K (Grade School Math 8K) is a dataset of grade school math problems, designed to evaluate the mathematical reasoning abilities of AI models.',
     dataset_id='modelscope/gsm8k',
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -18,6 +18,9 @@ logger = get_logger()
 @Benchmark.register(
     name='hellaswag',
     pretty_name='HellaSwag',
+    tags=['Commonsense', 'MCQ', 'Knowledge'],
+    description=
+    'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
     dataset_id='modelscope/hellaswag',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -13,6 +13,9 @@ logger = get_logger()
 @Benchmark.register(
     name='humaneval',
     pretty_name='HumanEval',
+    tags=['Coding'],
+    description=
+    'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',  # noqa: E501
     dataset_id='modelscope/humaneval',
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -10,6 +10,9 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='ifeval',
     pretty_name='IFEval',
+    tags=['Instruction-Following'],
+    description=
+    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
     dataset_id='opencompass/ifeval',
     subset_list=['default'],
     metric_list=[
evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='iquiz',
     pretty_name='IQuiz',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -8,7 +8,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='live_code_bench',
-    pretty_name='Live
+    pretty_name='Live-Code-Bench',
+    tags=['Coding'],
+    description=
+    'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
     dataset_id='AI-ModelScope/code_generation_lite',
     subset_list=['release_latest'],
     metric_list=['Pass@1'],
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -11,6 +11,9 @@ SUBSET_LIST = ['default']
 @Benchmark.register(
     name='maritime_bench',
     pretty_name='MaritimeBench',
+    tags=['Maritime', 'MCQ', 'Knowledge'],
+    description=
+    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
     dataset_id='HiDolphin/MaritimeBench',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/math_500/math_500_adapter.py
@@ -10,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='math_500',
     pretty_name='MATH-500',
+    tags=['Mathematics'],
+    description=
+    "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.",  # noqa: E501
     dataset_id='AI-ModelScope/MATH-500',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -136,6 +136,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu',
     pretty_name='MMLU',
+    tags=['Knowledge', 'MCQ'],
+    description=
+    "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.",  # noqa: E501
     dataset_id='modelscope/mmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
@@ -15,6 +15,9 @@ SUBSET_LIST = [
 @Benchmark.register(
     name='mmlu_pro',
     pretty_name='MMLU-Pro',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
     dataset_id='modelscope/MMLU-Pro',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],