evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +30 -15
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
|
|
5
|
-
def is_equiv(str1, str2, verbose=False):
    """Decide whether two answer strings are equivalent after normalization.

    Args:
        str1: First answer string, or None.
        str2: Second answer string, or None.
        verbose: When true, print both normalized strings before comparing.

    Returns:
        True if both inputs are None (a warning is printed), False if exactly
        one is None, otherwise whether the ``strip_string`` forms of the two
        inputs are equal. If normalization raises, the raw inputs are
        compared instead.
    """
    if str1 is None or str2 is None:
        if str1 is None and str2 is None:
            print('WARNING: Both None')
            return True
        return False

    try:
        normalized = (strip_string(str1), strip_string(str2))
        if verbose:
            print(normalized[0], normalized[1])
        return normalized[0] == normalized[1]
    except Exception:
        # Best-effort fallback: normalization failed, compare the raw inputs.
        return str1 == str2
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def remove_boxed(s):
    r"""Strip a leading ``\boxed`` wrapper and return the wrapped content.

    Two forms are accepted: ``\boxed <content>`` (prefix followed by a space)
    and ``\boxed{<content>}``.

    Args:
        s: String expected to start with ``\boxed `` or ``\boxed{``.

    Returns:
        The content without the wrapper.

    Raises:
        AssertionError: If ``s`` does not have one of the two expected
            shapes. The check is raised explicitly instead of using
            ``assert`` so it still runs under ``python -O``.
    """
    spaced_prefix = '\\boxed '
    if spaced_prefix in s:
        if not s.startswith(spaced_prefix):
            raise AssertionError('expected string to start with %r' % spaced_prefix)
        return s[len(spaced_prefix):]

    braced_prefix = '\\boxed{'
    if not s.startswith(braced_prefix):
        raise AssertionError('expected string to start with %r' % braced_prefix)
    if not s.endswith('}'):
        raise AssertionError("expected string to end with '}'")

    return s[len(braced_prefix):-1]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def last_boxed_only_string(string):
    r"""Extract the last ``\boxed``/``\fbox`` expression from ``string``.

    If the space-separated form ``\boxed `` occurs anywhere, the text after
    its last occurrence up to the next ``$`` is returned with the prefix
    re-attached. Otherwise the last ``\boxed`` (or, failing that, ``\fbox``)
    is located and returned together with its brace-balanced argument.

    Args:
        string: Text to search.

    Returns:
        The matched substring, or None when no marker is found or its braces
        never balance.
    """
    spaced_marker = '\\boxed '
    start = string.rfind('\\boxed')
    if spaced_marker in string:
        tail = string.split(spaced_marker)[-1]
        return spaced_marker + tail.split('$')[0]
    if start < 0:
        start = string.rfind('\\fbox')
        if start < 0:
            return None

    # Walk forward from the marker and stop at the brace that closes the
    # first opened group.
    depth = 0
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]

    return None
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def fix_fracs(string):
    r"""Add braces to shorthand ``\frac`` arguments.

    ``\frac12`` becomes ``\frac{1}{2}`` and ``\frac1{72}`` becomes
    ``\frac{1}{72}``. A ``\frac`` already followed by ``{`` is left alone.

    Args:
        string: Expression to rewrite.

    Returns:
        The rewritten expression, or the input unchanged when any ``\frac``
        is followed by fewer than two characters (including a trailing
        ``\frac`` with nothing after it, which previously raised
        ``IndexError``).
    """
    head, *tails = string.split('\\frac')
    parts = [head]
    for tail in tails:
        parts.append('\\frac')
        if tail.startswith('{'):
            parts.append(tail)
            continue
        # Need one character for the numerator and one for the denominator;
        # otherwise give up and hand back the input untouched.
        if len(tail) < 2:
            return string
        numerator, denominator, rest = tail[0], tail[1], tail[2:]
        if denominator == '{':
            # Denominator is already braced: only wrap the numerator.
            parts.append('{' + numerator + '}' + denominator + rest)
        else:
            parts.append('{' + numerator + '}{' + denominator + '}' + rest)
    return ''.join(parts)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def fix_a_slash_b(string):
    r"""Rewrite a plain integer ratio ``a/b`` as ``\frac{a}{b}``.

    Args:
        string: Candidate expression.

    Returns:
        ``\frac{a}{b}`` when ``string`` is exactly two integers in canonical
        form separated by one ``/``; otherwise ``string`` unchanged. Operands
        that are not integers (e.g. ``x/y``) are returned unchanged rather
        than letting ``ValueError`` escape.
    """
    if string.count('/') != 1:
        return string

    numerator_text, denominator_text = string.split('/')
    try:
        numerator = int(numerator_text)
        denominator = int(denominator_text)
    except ValueError:
        return string

    # Reject non-canonical spellings such as ' 1/2' or '01/2', which int()
    # accepts but which do not round-trip.
    if string != '{}/{}'.format(numerator, denominator):
        return string

    return '\\frac{' + str(numerator) + '}{' + str(denominator) + '}'
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def remove_right_units(string):
    r"""Drop a trailing ``\text{ ...}`` unit annotation.

    Args:
        string: Expression that may carry a ``\text{ `` marker (note the
            space after the brace).

    Returns:
        Everything before the marker, or ``string`` unchanged when the
        marker is absent.

    Raises:
        AssertionError: If the marker occurs more than once. Raised
            explicitly rather than via ``assert`` so the check is not
            skipped under ``python -O``.
    """
    marker = '\\text{ '
    # "\\text{ " only ever occurs (at least in the val set) when describing units
    if marker not in string:
        return string

    splits = string.split(marker)
    if len(splits) != 2:
        raise AssertionError('expected exactly one %r in string' % marker)
    return splits[0]
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
def fix_sqrt(string):
    r"""Brace the single-character argument of shorthand ``\sqrt``.

    ``\sqrt3`` becomes ``\sqrt{3}``; ``\sqrt{3}`` is left as is.

    Args:
        string: Expression to rewrite.

    Returns:
        The rewritten expression. A ``\sqrt`` with nothing after it is kept
        verbatim (this previously raised ``IndexError``).
    """
    if '\\sqrt' not in string:
        return string

    head, *tails = string.split('\\sqrt')
    pieces = [head]
    for tail in tails:
        if tail and tail[0] != '{':
            # Only the first character is taken as the radicand.
            pieces.append('\\sqrt{' + tail[0] + '}' + tail[1:])
        else:
            pieces.append('\\sqrt' + tail)
    return ''.join(pieces)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def strip_string(string):
    """Normalize a LaTeX-style answer string for comparison.

    The steps run in a fixed order: remove formatting noise (line breaks,
    negative spaces, doubled backslashes, left/right, degree marks, escaped
    dollar and percent signs, right-hand units), add a leading zero to bare
    decimals, drop a short ``x =`` style left-hand side, brace shorthand
    sqrt/frac arguments, remove spaces, and rewrite ``0.5`` and simple
    ``a/b`` ratios as fractions.

    Args:
        string: Raw answer text.

    Returns:
        The normalized string. An input that is empty after the initial
        clean-up is returned immediately.
    """
    # linebreaks
    string = string.replace('\n', '')

    # remove inverse spaces
    string = string.replace('\\!', '')

    # replace \\ with \
    string = string.replace('\\\\', '\\')

    # replace tfrac and dfrac with frac
    string = string.replace('tfrac', 'frac')
    string = string.replace('dfrac', 'frac')

    # remove \left and \right
    string = string.replace('\\left', '')
    string = string.replace('\\right', '')

    # Remove circ (degrees)
    string = string.replace('^{\\circ}', '')
    string = string.replace('^\\circ', '')

    # remove dollar signs
    string = string.replace('\\$', '')

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage; one replace is enough, since the former second
    # literal used an invalid escape that denoted the same two characters
    string = string.replace('\\%', '')

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(' .', ' 0.')
    string = string.replace('{.', '{0.')
    # if empty, return empty string
    if not string:
        return string
    if string[0] == '.':
        string = '0' + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    sides = string.split('=')
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(' ', '')

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} # noqa: E501
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == '0.5':
        string = '\\frac{1}{2}'

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = fix_a_slash_b(string)

    return string
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|