evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (59)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
@@ -1,200 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
-
4
- # Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
5
def is_equiv(str1, str2, verbose=False):
    """Report whether two answer strings match after normalization.

    Each string is passed through ``strip_string`` and the normalized forms
    are compared.  If normalization raises, the raw strings are compared
    instead.

    Args:
        str1: First answer string, or ``None``.
        str2: Second answer string, or ``None``.
        verbose: When true, print the two normalized strings.

    Returns:
        ``True`` when both inputs are ``None`` (a warning is printed first),
        ``False`` when exactly one of them is ``None``, otherwise the result
        of the equality comparison.
    """
    first_missing = str1 is None
    second_missing = str2 is None
    if first_missing and second_missing:
        print('WARNING: Both None')
        return True
    if first_missing or second_missing:
        return False

    try:
        lhs = strip_string(str1)
        rhs = strip_string(str2)
        if verbose:
            print(lhs, rhs)
        return lhs == rhs
    except Exception:
        # Normalization failed; fall back to comparing the inputs as given.
        return str1 == str2
20
-
21
-
22
def remove_boxed(s):
    r"""Strip a leading ``\boxed`` wrapper and return what it encloses.

    Two forms are accepted: ``\boxed <content>`` (prefix followed by a
    space) and ``\boxed{<content>}``.

    Args:
        s: String expected to start with one of the two wrappers.

    Returns:
        The content with the wrapper removed.

    Raises:
        AssertionError: If ``s`` contains ``\boxed `` but does not start with
            it, or otherwise does not start with ``\boxed{`` and end with
            ``}``.  The checks are raised explicitly rather than written as
            ``assert`` statements so they still run under ``python -O``.
    """
    spaced_prefix = '\\boxed '
    if spaced_prefix in s:
        if not s.startswith(spaced_prefix):
            raise AssertionError('expected string to start with "\\boxed "')
        return s[len(spaced_prefix):]

    braced_prefix = '\\boxed{'
    if not s.startswith(braced_prefix):
        raise AssertionError('expected string to start with "\\boxed{"')
    if not s.endswith('}'):
        raise AssertionError('expected string to end with "}"')

    return s[len(braced_prefix):-1]
34
-
35
-
36
def last_boxed_only_string(string):
    r"""Extract the last ``\boxed`` (or ``\fbox``) expression from a string.

    If the string contains ``\boxed `` (with a trailing space), the result is
    ``\boxed `` followed by the text after its last occurrence, cut at the
    first ``$``.  Otherwise the last ``\boxed`` is located (falling back to
    the last ``\fbox`` when there is no ``\boxed``) and the text from there
    up to the brace that balances its first ``{`` is returned.

    Args:
        string: Text to search.

    Returns:
        The extracted expression including its wrapper, or ``None`` when no
        wrapper is present or its braces never balance.
    """
    spaced_prefix = '\\boxed '
    if spaced_prefix in string:
        tail = string.rpartition(spaced_prefix)[2]
        return spaced_prefix + tail.partition('$')[0]

    start = string.rfind('\\boxed')
    if start < 0:
        start = string.rfind('\\fbox')
        if start < 0:
            return None

    # Walk forward from the wrapper, tracking brace depth; the expression
    # ends at the closing brace that brings the depth back to zero.
    depth = 0
    for pos, char in enumerate(string[start:], start):
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]

    return None
64
-
65
-
66
def fix_fracs(string):
    r"""Add the missing braces to shorthand ``\frac`` arguments.

    ``\frac12`` becomes ``\frac{1}{2}`` and ``\frac1{72}`` becomes
    ``\frac{1}{72}``.  A ``\frac`` already followed by ``{`` is left as is.

    Args:
        string: Expression that may contain ``\frac`` commands.

    Returns:
        The rewritten expression.  If any ``\frac`` is followed by fewer than
        two characters (including a trailing ``\frac`` with nothing after
        it), the original ``string`` is returned unchanged.
    """
    head, *segments = string.split('\\frac')
    parts = [head]
    for segment in segments:
        parts.append('\\frac')
        if segment.startswith('{'):
            parts.append(segment)
            continue
        # Need one character each for numerator and denominator; an explicit
        # check (not ``assert``) also covers the empty segment and ``-O``.
        if len(segment) < 2:
            return string
        numerator, denominator, rest = segment[0], segment[1], segment[2:]
        if denominator == '{':
            # Denominator is already braced: only wrap the numerator.
            parts.append('{' + numerator + '}' + denominator + rest)
        else:
            parts.append('{' + numerator + '}{' + denominator + '}' + rest)
    return ''.join(parts)
96
-
97
-
98
def fix_a_slash_b(string):
    r"""Rewrite a plain integer fraction ``a/b`` as ``\frac{a}{b}``.

    The rewrite happens only when the whole string is exactly two integers
    separated by a single ``/`` in canonical form (no spaces, no leading
    zeros or signs that ``int`` would normalize away).

    Args:
        string: Candidate expression.

    Returns:
        ``\frac{a}{b}`` for a canonical integer fraction, otherwise the
        input unchanged.
    """
    pieces = string.split('/')
    if len(pieces) != 2:
        return string
    try:
        numerator = int(pieces[0])
        denominator = int(pieces[1])
    except ValueError:
        # Non-integer operands (e.g. 'x/y'): not a simple fraction.
        return string
    # Reject inputs that int() accepted but that are not in canonical form.
    if string != '{}/{}'.format(numerator, denominator):
        return string
    return '\\frac{' + str(numerator) + '}{' + str(denominator) + '}'
111
-
112
-
113
def remove_right_units(string):
    r"""Drop a trailing ``\text{ ...}`` annotation from an expression.

    The marker ``\text{ `` (with a space after the brace) is treated as the
    start of a units annotation; everything from the marker onward is
    discarded.

    Args:
        string: Expression that may end with a units annotation.

    Returns:
        The text before the marker, or the input unchanged when the marker
        is absent.

    Raises:
        AssertionError: If the marker occurs more than once.  Raised
            explicitly rather than via ``assert`` so the behaviour is the
            same under ``python -O``.
    """
    marker = '\\text{ '
    if marker not in string:
        return string
    pieces = string.split(marker)
    if len(pieces) != 2:
        raise AssertionError('expected exactly one "\\text{ " marker')
    return pieces[0]
121
-
122
-
123
def fix_sqrt(string):
    r"""Brace the single-character argument of shorthand ``\sqrt`` commands.

    ``\sqrt3`` becomes ``\sqrt{3}``; a ``\sqrt`` already followed by ``{`` is
    left as is.  Only the first character after ``\sqrt`` is wrapped.

    Args:
        string: Expression that may contain ``\sqrt`` commands.

    Returns:
        The rewritten expression.  A ``\sqrt`` with nothing after it is kept
        unchanged.
    """
    if '\\sqrt' not in string:
        return string
    head, *segments = string.split('\\sqrt')
    parts = [head]
    for segment in segments:
        # An empty segment means '\sqrt' ended the string (or was directly
        # followed by another '\sqrt'); there is no argument to wrap.
        if segment and segment[0] != '{':
            parts.append('\\sqrt{' + segment[0] + '}' + segment[1:])
        else:
            parts.append('\\sqrt' + segment)
    return ''.join(parts)
136
-
137
-
138
def strip_string(string):
    r"""Normalize a LaTeX answer string so equivalent answers compare equal.

    The steps run in a fixed order: textual clean-ups (newlines, ``\!``,
    doubled backslashes, ``tfrac``/``dfrac``, ``\left``/``\right``, degree
    marks, ``\$``), trailing-units removal, ``\%`` removal, leading-zero
    insertion for bare decimals, dropping a short ``x =`` prefix, ``\sqrt``
    and ``\frac`` brace repair, space removal, and rewriting ``0.5`` and
    simple ``a/b`` fractions as ``\frac``.

    Args:
        string: Raw answer string.

    Returns:
        The normalized string.  An input that becomes empty after the early
        clean-up steps is returned as the empty string.

    Note:
        Exceptions raised by the helper functions propagate to the caller.
    """
    # Order matters: later rules assume earlier ones have already run.
    simple_rewrites = (
        ('\n', ''),          # linebreaks
        ('\\!', ''),         # LaTeX negative-space command
        ('\\\\', '\\'),      # replace \\ with \
        ('tfrac', 'frac'),   # tfrac and dfrac become frac
        ('dfrac', 'frac'),
        ('\\left', ''),      # remove \left and \right
        ('\\right', ''),
        ('^{\\circ}', ''),   # remove circ (degrees)
        ('^\\circ', ''),
        ('\\$', ''),         # remove escaped dollar signs
    )
    for old, new in simple_rewrites:
        string = string.replace(old, new)

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage
    # NOTE(review): the original followed this with a second replace of
    # '\%', an invalid escape sequence that evaluates to the same two
    # characters as '\\%', so it never matched anything new.  If stripping a
    # bare '%' was intended, confirm before adding it.
    string = string.replace('\\%', '')

    # " 0." equivalent to " ." and "{0." equivalent to "{."
    string = string.replace(' .', ' 0.')
    string = string.replace('{.', '{0.')
    # if empty, return empty string
    if not string:
        return string
    # add "0" if "." is the start of the string
    if string.startswith('.'):
        string = '0' + string

    # get rid of e.g. "k = " or "q = " at the beginning: exactly one '=' and
    # at most two characters on its left
    sides = string.split('=')
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(' ', '')

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.  Even works
    # with \frac1{72} (but not \frac{72}1).
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == '0.5':
        string = '\\frac{1}{2}'

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in
    # case the model output is X/Y
    string = fix_a_slash_b(string)

    return string