evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,85 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import LLMJudge
+
+ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+ <text>
+ {context}
+ </text>
+
+ {question}
+
+ Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+ @Benchmark.register(
+     name='docmath',
+     pretty_name='DocMath',
+     tags=['Reasoning', 'Mathematics', 'Long Context'],
+     description=
+     'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+     dataset_id='yale-nlp/DocMath-Eval',
+     metric_list=['AverageAccuracy'],
+     subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template=TEMPLATE_0SHOT,
+ )
+ class DocMathAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs):
+         # default load mini test
+         kwargs['split_as_subset'] = True
+         data_dict = super().load(**kwargs)
+         return data_dict
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         """
+         context = '\n'.join(input_d['paragraphs'])
+         question = input_d['question']
+         prompt = self.prompt_template.format(context=context, question=question)
+         return self.gen_prompt_data(prompt)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         return input_d['ground_truth']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         from .utils import extract_answer
+
+         extracted_answer = extract_answer(result)
+         return extracted_answer
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         from .utils import get_acc
+
+         return get_acc(prediction=pred, gt=gold)
+
+     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+         raw_input = kwargs.get('raw_input', None)
+         question = raw_input['question']
+         # get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+         # parse grading response
+         if 'YES' in orm_response:
+             return 1.0
+         else:
+             return 0.0
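
The adapter above registers the new benchmark under the name 'docmath'. Assuming the `TaskConfig` and `run_task` entry points that evalscope exposes at the package top level (see evalscope/run.py and evalscope/config.py in the file list), a minimal smoke-test invocation might look like the sketch below; the model id and the `limit` value are placeholders, not part of this release.

    from evalscope import TaskConfig, run_task  # assumed top-level exports

    # Hypothetical smoke test: any model evalscope can load or reach would do.
    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
        datasets=['docmath'],              # benchmark name registered by the adapter above
        limit=5,                           # evaluate only a handful of samples
    )
    run_task(task_cfg=task_cfg)
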
@@ -0,0 +1,220 @@
+ import math
+ import numpy as np
+ import re
+ from sympy import Rational
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+ Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+ Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+ Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+ Your output must follow the following format:
+ 1) Provide an explanation for why the answers are equivalent or not.
+ 2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+ """  # noqa: E501
+
+ ORM_USER_TEMPLATE = """
+ Problem: {problem}
+ Answer 1: {answer_1}
+ Answer 2: {answer_2}
+ """
+
+
+ def round_up_to_decimal(number, decimals):
+     factor = 10**decimals
+     return math.ceil(number * factor) / factor
+
+
+ def is_number(string):
+     pattern = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
+     match = re.match(pattern, string)
+     return bool(match)
+
+
+ def is_scientific_number(string):
+     pattern = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
+     match = re.match(pattern, string)
+     return bool(match)
+
+
+ def normalize(prediction: str):
+     # Preprocessing the string [Stage 1]
+     prediction = prediction.strip()
+     prediction = prediction.rstrip('.')
+     if not isinstance(prediction, str):
+         prediction = str(prediction) if prediction is not None else '0'
+
+     for money in ['£', '€', '¥', 'million', 'billion', 'thousand', 'US', 'USD', 'RMB']:
+         prediction = prediction.replace(money, '')
+
+     # Replace special tokens
+     if '=' in prediction:
+         prediction = prediction.split('=')[-1].strip()
+     if '≈' in prediction:
+         prediction = prediction.split('≈')[-1].strip()
+     if '`' in prediction:
+         prediction = prediction.replace('`', '')
+     if '%' in prediction:
+         prediction = prediction.replace('%', '')
+     if '$' in prediction:
+         prediction = prediction.replace('$', '')
+     if '°' in prediction:
+         prediction = prediction.replace('°', '')
+
+     # Detect the boolean keyword in the generation
+     if prediction in ['true', 'yes', 'false', 'no']:
+         if prediction == 'true' or prediction == 'yes':
+             prediction = 'True'
+         else:
+             prediction = 'False'
+     if 'True' in prediction or 'False' in prediction:
+         prediction = 'True' if 'True' in prediction else 'False'
+
+     # Detect the approximation keyword
+     if 'approximately' in prediction:
+         prediction = prediction.replace('approximately', '').strip()
+     if ' or ' in prediction:
+         prediction = prediction.split(' or ')[0]
+
+     # Drop the units before and after the number
+     if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
+         prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction).group(1)
+     if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
+         prediction = re.search(r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction).group(1)
+     if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
+         prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction).group(1)
+     if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
+         prediction = re.search(r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction).group(1)
+
+     # Preprocessing the number [Stage 1]
+     if '10^' in prediction:
+         prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
+     if ' x ' in prediction:
+         prediction = prediction.replace(' x ', '*')
+     if ' × ' in prediction:
+         prediction = prediction.replace(' × ', '*')
+     if is_number(prediction):
+         prediction = prediction.replace(',', '')
+
+     # Preprocessing the option [Stage 3]
+     if '(a)' in prediction or '(b)' in prediction or '(c)' in prediction or '(d)' in prediction:
+         prediction = '"' + re.search(r'\([a-d]\)', prediction).group(0) + '"'
+
+     # If the prediction is empty, use dummy '0'
+     if not prediction:
+         prediction = '0'
+
+     # Converting the string answer to a number/list/bool/option
+     try:
+         prediction = eval(prediction)
+     except Exception:
+         # TO CHECK
+         prediction = 0
+
+     # Performing common type conversion
+     if isinstance(prediction, (set, tuple)):
+         prediction = list(prediction)
+         if isinstance(prediction[0], complex):
+             prediction = [tmp.real for tmp in prediction]
+         elif isinstance(prediction[0], Rational):
+             prediction = [float(tmp) for tmp in prediction]
+     elif isinstance(prediction, np.ndarray):
+         prediction = prediction.tolist()
+     else:
+         if isinstance(prediction, complex):
+             prediction = prediction.real
+         elif isinstance(prediction, Rational):
+             prediction = float(prediction)
+
+     return prediction
+
+
+ def extract_answer(response: str):
+     """Parses the final answer from the model's response text.
+
+     Args:
+         response: Text extracted from the model's response
+
+     Returns:
+         The final answer as a numeric value (string), or None if not found
+     """
+     # Remove any asterisks or other unwanted characters
+     response = response.replace('*', '')
+     response = response.replace('(', '')
+     response = response.replace(')', '')
+
+     # Search for the pattern 'the answer is {final answer}.'
+     match = re.search(r'the answer is (\=?\≈?\`?\%?\$?\°?\£?\€?\¥?-?[0-9\.,]+)', response, re.IGNORECASE)
+
+     if match:
+         # Remove commas from the matched number (if any)
+         res = match.group(1).replace(',', '').rstrip('.')
+         return res
+     else:
+         return response
+
+
+ def within_eps(pred: float, gt: float):
+     eps = abs(gt) * 0.0015
+     if pred >= gt - eps and pred <= gt + eps:
+         return True
+     else:
+         return False
+
+
+ def compare_two_numbers(p, gt):
+     if isinstance(p, int) or isinstance(p, float):
+         pass
+     elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
+         return False
+     elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
+         return False
+     else:
+         raise ValueError(p)
+
+     v1, v2 = max(abs(gt), abs(p)), min(abs(gt), abs(p))
+     if (v1 != 0 and v2 != 0) and int(math.log10(v1 / v2)) == math.log10(v1 / v2):
+         return True
+
+     if v2 <= v1 / 50 and within_eps(pred=v2 * 100, gt=v1):
+         return True
+     elif v2 <= v1 / 500 and within_eps(pred=v2 * 1000, gt=v1):
+         return True
+     elif v2 <= v1 / 50000 and within_eps(pred=v2 * 100000, gt=v1):
+         return True
+
+     if round_up_to_decimal(v1, 2) == round_up_to_decimal(v2, 2):
+         return True
+
+     return within_eps(pred=p, gt=gt)
+
+
+ def get_acc(prediction, gt, cot=True):
+     try:
+         if cot:
+             prediction = normalize(prediction)
+         else:
+             prediction = float(prediction)
+
+         answer_type = type(gt).__name__
+         assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
+         if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
+             # Comparing prediction against the reference
+             if answer_type in ['bool']:
+                 acc = int(prediction == gt)
+             elif answer_type == 'int':
+                 acc = int(compare_two_numbers(prediction, gt))
+             elif answer_type == 'float' or answer_type == 'float64':
+                 acc = int(compare_two_numbers(prediction, gt))
+             else:
+                 acc = 0
+         else:
+             acc = 0
+             logger.error(f'Error: {prediction}, {type(prediction)}')
+         return acc
+     except Exception:
+         return 0
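
For a sense of what the new matching helpers accept, the short check below exercises `extract_answer` and `get_acc` from evalscope/benchmarks/docmath/utils.py as added above; the sample response and numbers are invented.

    from evalscope.benchmarks.docmath.utils import extract_answer, get_acc

    response = 'The cash flow grows steadily. Therefore, the answer is 2,500,000.'
    pred = extract_answer(response)                  # -> '2500000' (commas and trailing period stripped)

    print(get_acc(prediction=pred, gt=2500000.0))    # 1: exact numeric match
    print(get_acc(prediction='3.1414', gt=3.1416))   # 1: treated as equal by the tolerance rules
    print(get_acc(prediction='2.9', gt=3.1416))      # 0: outside every tolerance rule
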
@@ -31,6 +31,9 @@ Answer: 43
  @Benchmark.register(
      name='drop',
      pretty_name='DROP',
+     tags=['Reasoning'],
+     description=
+     'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
      dataset_id='AI-ModelScope/DROP',
      metric_list=['AverageAccuracy'],
      few_shot_num=0,
File without changes
@@ -0,0 +1,91 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType, OutputType
+ from evalscope.metrics import LLMJudge, exact_match
+
+ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+ <text>
+ {context}
+ </text>
+
+ {question}
+
+ Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+ @Benchmark.register(
+     name='frames',
+     pretty_name='FRAMES',
+     tags=['Reasoning', 'Long Context'],
+     description=
+     'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+     dataset_id='iic/frames',
+     model_adapter=OutputType.GENERATION,
+     output_types=[OutputType.GENERATION],
+     metric_list=['AverageAccuracy'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template=TEMPLATE_0SHOT,
+ )
+ class FramesAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs):
+         # default load with snapshot
+         kwargs['file_structure'] = {'default': ['test.jsonl']}
+         data_dict = super().load_with_snapshot(**kwargs)
+         return data_dict
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         """
+         context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
+         question = input_d['Prompt']
+         prompt = self.prompt_template.format(context=context, question=question)
+         return self.gen_prompt_data(prompt)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         return input_d['Answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         response = result.replace('*', '')
+
+         if 'the answer is' in response:
+             ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
+         else:
+             ans = ''
+
+         return ans
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         from .utils import normalize_answer
+         gold = normalize_answer(gold)
+         pred = normalize_answer(pred)
+         return exact_match(gold=gold, pred=pred)
+
+     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+         raw_input = kwargs.get('raw_input', None)
+         question = raw_input['Prompt']
+         # get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+         # parse grading response
+         if 'YES' in orm_response:
+             return 1.0
+         else:
+             return 0.0
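
Both the DocMath and FRAMES adapters fall back to the same `llm_match` pattern: build a grading prompt from the ORM templates, send it to an `LLMJudge`, and score 1.0 whenever the verdict contains YES. The standalone sketch below replays that flow with a stub judge instead of a real model; the question and answers are invented, and the templates are imported from the docmath utils module shown earlier (the frames utils module added below defines identical ones).

    from evalscope.benchmarks.docmath.utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

    def stub_judge(prompt, system_prompt):
        # A real LLMJudge would call a model here; the stub just returns a canned verdict.
        return 'Both answers name the same landmark. [[YES]]'

    prompt = ORM_USER_TEMPLATE.format(
        problem="Which tower was built for the 1889 World's Fair?",
        answer_1='Eiffel Tower',
        answer_2='the Eiffel tower')
    orm_response = stub_judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
    score = 1.0 if 'YES' in orm_response else 0.0
    print(score)  # 1.0
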
@@ -0,0 +1,37 @@
+ import re
+ import string
+
+
+ def normalize_answer(s):
+
+     def remove_articles(text):
+         return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+     def white_space_fix(text):
+         return ' '.join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return ''.join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+ Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+ Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+ Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+ Your output must follow the following format:
+ 1) Provide an explanation for why the answers are equivalent or not.
+ 2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+ """  # noqa: E501
+
+ ORM_USER_TEMPLATE = """
+ Problem: {problem}
+ Answer 1: {answer_1}
+ Answer 2: {answer_2}
+ """
@@ -1,11 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import csv
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
  from evalscope.utils import ResponseParser
+ from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -15,7 +16,9 @@ logger = get_logger()

  @Benchmark.register(
      name='general_mcq',
-     pretty_name='General MCQ',
+     pretty_name='General-MCQ',
+     description='A general multiple-choice question answering dataset.',
+     tags=['MCQ', 'Custom'],
      dataset_id='general_mcq',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -24,7 +27,7 @@ logger = get_logger()
      few_shot_num=0,
      train_split='dev',
      eval_split='val',
-     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
+     prompt_template='请回答问题,并选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
      query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
  class GeneralMCQAdapter(DataAdapter):

@@ -34,28 +37,21 @@ class GeneralMCQAdapter(DataAdapter):
          self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = {}
+         data_dict = defaultdict(dict)
          for subset_name in subset_list:
              for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 if os.path.exists(file_path):
-                     with open(file_path, encoding='utf-8') as f:
-                         rows = []
-                         reader = csv.reader(f)
-                         header = next(reader)
-                         for row in reader:
-                             item = dict(zip(header, row))
-                             rows.append(item)
-
-                     if subset_name in data_dict:
-                         data_dict[subset_name].update({split_name: rows})
-                     else:
-                         data_dict[subset_name] = {split_name: rows}
-
-         return data_dict
+                 # Check for files with different extensions
+                 for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+                     if os.path.exists(dataset_name_or_path):
+                         file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+                     else:
+                         file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+
+                     if os.path.exists(file_path):
+                         data_dict[subset_name][split_name] = loader(file_path)
+                         break  # Stop checking other extensions once a file is found
+
+         return dict(data_dict)

      def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
          """
@@ -13,6 +13,9 @@ logger = get_logger()

  @Benchmark.register(
      name='general_qa',
+     pretty_name='General-QA',
+     description='General Question Answering dataset',
+     tags=['QA', 'Custom'],
      dataset_id='general_qa',
      subset_list=['default'],
      metric_list=['AverageBLEU', 'AverageRouge'],
@@ -10,6 +10,9 @@ from evalscope.metrics import exact_match
  @Benchmark.register(
      name='gpqa',
      pretty_name='GPQA',
+     tags=['MCQ', 'Knowledge'],
+     description=
+     'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
      dataset_id='modelscope/gpqa',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -15,6 +15,9 @@ logger = get_logger()
  @Benchmark.register(
      name='gsm8k',
      pretty_name='GSM8K',
+     tags=['Mathematics'],
+     description=
+     'GSM8K (Grade School Math 8K) is a dataset of grade school math problems, designed to evaluate the mathematical reasoning abilities of AI models.',
      dataset_id='modelscope/gsm8k',
      subset_list=['main'],
      metric_list=['AverageAccuracy'],
@@ -18,6 +18,9 @@ logger = get_logger()
  @Benchmark.register(
      name='hellaswag',
      pretty_name='HellaSwag',
+     tags=['Commonsense', 'MCQ', 'Knowledge'],
+     description=
+     'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
      dataset_id='modelscope/hellaswag',
      model_adapter=OutputType.MULTIPLE_CHOICE,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -13,6 +13,9 @@ logger = get_logger()
  @Benchmark.register(
      name='humaneval',
      pretty_name='HumanEval',
+     tags=['Coding'],
+     description=
+     'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',  # noqa: E501
      dataset_id='modelscope/humaneval',
      subset_list=['openai_humaneval'],
      metric_list=['Pass@1'],
@@ -10,6 +10,9 @@ from evalscope.metrics import Metric, mean, metric_registry
  @Benchmark.register(
      name='ifeval',
      pretty_name='IFEval',
+     tags=['Instruction-Following'],
+     description=
+     'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
      dataset_id='opencompass/ifeval',
      subset_list=['default'],
      metric_list=[
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
  @Benchmark.register(
      name='iquiz',
      pretty_name='IQuiz',
+     tags=['Knowledge', 'MCQ', 'Chinese'],
+     description=
+     'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
      dataset_id='AI-ModelScope/IQuiz',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -8,7 +8,10 @@ logger = get_logger()

  @Benchmark.register(
      name='live_code_bench',
-     pretty_name='Live Code Bench',
+     pretty_name='Live-Code-Bench',
+     tags=['Coding'],
+     description=
+     'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
      dataset_id='AI-ModelScope/code_generation_lite',
      subset_list=['release_latest'],
      metric_list=['Pass@1'],
@@ -11,6 +11,9 @@ SUBSET_LIST = ['default']
  @Benchmark.register(
      name='maritime_bench',
      pretty_name='MaritimeBench',
+     tags=['Maritime', 'MCQ', 'Knowledge'],
+     description=
+     'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
      dataset_id='HiDolphin/MaritimeBench',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -10,6 +10,9 @@ logger = get_logger()
  @Benchmark.register(
      name='math_500',
      pretty_name='MATH-500',
+     tags=['Mathematics'],
+     description=
+     "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.",  # noqa: E501
      dataset_id='AI-ModelScope/MATH-500',
      subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
      metric_list=['AveragePass@1'],
@@ -136,6 +136,9 @@ SUBJECT_MAPPING = {
  @Benchmark.register(
      name='mmlu',
      pretty_name='MMLU',
+     tags=['Knowledge', 'MCQ'],
+     description=
+     "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.",  # noqa: E501
      dataset_id='modelscope/mmlu',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -15,6 +15,9 @@ SUBSET_LIST = [
  @Benchmark.register(
      name='mmlu_pro',
      pretty_name='MMLU-Pro',
+     tags=['MCQ', 'Knowledge'],
+     description=
+     'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
      dataset_id='modelscope/MMLU-Pro',
      model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],