evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (81)
  1. evalscope/arguments.py +3 -0
  2. evalscope/benchmarks/aime/__init__.py +0 -0
  3. evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  5. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  7. evalscope/benchmarks/benchmark.py +5 -3
  8. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  11. evalscope/benchmarks/data_adapter.py +88 -29
  12. evalscope/benchmarks/data_collection/__init__.py +0 -0
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  15. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  16. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  17. evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  18. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  19. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  20. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  21. evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  22. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  23. evalscope/benchmarks/math_500/__init__.py +0 -0
  24. evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  25. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  26. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. evalscope/benchmarks/race/race_adapter.py +3 -3
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. evalscope/cli/start_app.py +4 -1
  36. evalscope/cli/start_eval.py +4 -3
  37. evalscope/cli/start_perf.py +4 -2
  38. evalscope/collections/evaluator.py +109 -39
  39. evalscope/collections/sampler.py +2 -1
  40. evalscope/collections/schema.py +1 -2
  41. evalscope/config.py +4 -1
  42. evalscope/evaluator/evaluator.py +81 -65
  43. evalscope/metrics/__init__.py +2 -1
  44. evalscope/metrics/math_parser.py +526 -0
  45. evalscope/metrics/metrics.py +39 -3
  46. evalscope/metrics/named_metrics.py +31 -7
  47. evalscope/models/base_adapter.py +7 -1
  48. evalscope/models/chat_adapter.py +69 -49
  49. evalscope/models/choice_adapter.py +52 -45
  50. evalscope/models/custom_adapter.py +2 -2
  51. evalscope/models/local_model.py +7 -2
  52. evalscope/models/server_adapter.py +106 -61
  53. evalscope/perf/__init__.py +0 -1
  54. evalscope/perf/arguments.py +5 -1
  55. evalscope/perf/http_client.py +2 -2
  56. evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope/perf/utils/benchmark_util.py +6 -2
  58. evalscope/report/app.py +42 -23
  59. evalscope/run.py +11 -8
  60. evalscope/third_party/thinkbench/__init__.py +3 -0
  61. evalscope/third_party/thinkbench/eval.py +264 -0
  62. evalscope/third_party/thinkbench/infer.py +100 -0
  63. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  64. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  65. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  66. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  67. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  68. evalscope/utils/chat_service.py +2 -2
  69. evalscope/utils/io_utils.py +1 -1
  70. evalscope/utils/model_utils.py +17 -1
  71. evalscope/utils/utils.py +45 -45
  72. evalscope/version.py +2 -2
  73. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
  74. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
  75. tests/cli/test_run.py +108 -19
  76. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  77. evalscope/metrics/math_accuracy.py +0 -200
  78. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  79. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  80. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  81. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -2,7 +2,6 @@
 import re

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Pass1
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

@@ -17,11 +16,11 @@ logger = get_logger()
     dataset_id='modelscope/humaneval',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
-    metric_list=[Pass1],
+    metric_list=['Pass@1'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='',
+    prompt_template='Complete the following python code:\n{query}',
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -64,10 +63,10 @@ class HumanevalAdapter(DataAdapter):
             input_d (dict): The raw input. A single data format of the Humaneval:
             {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
-        full_prompt = input_d['prompt']
-        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
+        query = input_d['prompt']
+        full_prompt = self.prompt_template.format(query=query)

-        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

     @classmethod
     def _postprocess(cls, text: str) -> str:

evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -2,9 +2,9 @@ from collections import defaultdict
 from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
-from evalscope.metrics import Metric, mean
+from evalscope.metrics import Metric, mean, metric_registry
 from evalscope.models import ChatGenerationModelAdapter


@@ -14,10 +14,10 @@ from evalscope.models import ChatGenerationModelAdapter
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
-        Metric(name='prompt_level_strict_acc', object=mean),
-        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
-        Metric(name='prompt_level_loose_acc', object=mean),
-        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+        'prompt_level_strict_acc',
+        'inst_level_strict_acc',
+        'prompt_level_loose_acc',
+        'inst_level_loose_acc',
     ],
     few_shot_num=0,
     train_split=None,
@@ -29,8 +29,14 @@ class IFEvalAdapter(DataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

+        # register metrics
+        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
@@ -41,16 +47,11 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])

-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
             for k, v in res.items():
                 res_dict[k].append(v)

-        metrics = []
-        for metric in self.metric_list:
-            metric_name = metric.name
-            pred_value = res_dict[metric_name]
-            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
-        return metrics
+        return super().compute_metric(res_dict)

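The IFEval change above shows the new metric wiring in 0.12.0: metric_list holds metric names as strings, and the adapter registers the matching Metric objects with metric_registry in its constructor. A minimal skeleton of a custom adapter following the same pattern (the benchmark name, dataset id, and metric name below are hypothetical, and the remaining DataAdapter methods are omitted):

# Hypothetical skeleton, for illustration only; not part of the diff.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import Metric, mean, metric_registry
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_bench',                  # hypothetical benchmark name
    dataset_id='my-org/my-bench',     # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=['my_metric'],        # referenced by name, as in 0.12.0
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # register the metric object once, so the name in metric_list resolves
        metric_registry.register(Metric(name='my_metric', object=mean))
    # gen_prompt / get_gold_answer / parse_pred_result / match omitted
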
evalscope/benchmarks/iquiz/iquiz_adapter.py

@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser

@@ -10,11 +10,11 @@ from evalscope.utils.utils import ResponseParser
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['IQ', 'EQ'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
+    system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
 )
 class IQuizAdapter(DataAdapter):

@@ -36,7 +36,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def __form_options(self, options: list):
         option_str = '选项:\n'

evalscope/benchmarks/aime/../math_500/__init__.py — new empty file, no content to diff.

evalscope/benchmarks/math_500/math_500_adapter.py (new file)

@@ -0,0 +1,58 @@
+from collections import defaultdict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='math_500',
+    dataset_id='AI-ModelScope/MATH-500',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class Math500Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)

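A minimal sketch of how the new math_500 benchmark could be invoked, assuming run_task (imported lazily by start_eval.py later in this diff) accepts a plain dict task config; the model id and the model/datasets/limit field names are assumptions, not confirmed by this diff:

# Hypothetical smoke test for the new MATH-500 adapter; field names are assumptions.
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-Math-7B-Instruct',  # placeholder model id
    'datasets': ['math_500'],                  # benchmark name registered above
    'limit': 10,                               # evaluate only a few samples
}

if __name__ == '__main__':
    run_task(task_cfg)
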
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -4,17 +4,15 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser, normalize_score
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa

 logger = get_logger()

-DATASET_ID = 'modelscope/mmlu'
-
 SUBSET_LIST = [
     'high_school_european_history',
     'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/mmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
 )
 class MMLUAdapter(DataAdapter):

@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
            {'data': [full_prompt], 'multi_choices': self.choices}

        """
-        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]

         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
+        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice

evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py

@@ -3,22 +3,27 @@ from typing import Any, Dict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser

+SUBSET_LIST = [
+    'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+    'philosophy', 'economics', 'other', 'psychology', 'history'
+]
+

 @Benchmark.register(
     name='mmlu_pro',
-    dataset_id='modelscope/mmlu-pro',
+    dataset_id='modelscope/MMLU-Pro',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='validation',
     eval_split='test',
     prompt_template=
-    'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.',  # noqa: E501
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
 )
 class MMLUProAdapter(DataAdapter):

@@ -26,38 +31,29 @@ class MMLUProAdapter(DataAdapter):
         super().__init__(**kwargs)

         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.categories = [
-            'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-            'philosophy', 'economics', 'other', 'psychology', 'history'
-        ]
-
-    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
-        Return a dict with category as key and list of prompts as value.
-        """
-
-        data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            prefix = fewshot_prompts[entry['category']]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
-
-            res_dict[entry['category']].append(prompt_d)
-        return res_dict

-    def get_fewshot_examples(self, data_dict: dict):
-        # load 5-shot prompts for each category
-        prompts = {c: '' for c in self.categories}
-        for d in data_dict[self.train_split]:
-            prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
+    def load(self, **kwargs):
+        # default load all data
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category')
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['options']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
                 self.__form_options(d['options']) + '\n' + \
                 d['cot_content'] + '\n\n'
         return prompts

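Both the MATH-500 and MMLU-Pro adapters now load a single 'default' subset and call reformat_subset to regroup it by a per-row key ('level' and 'category' respectively). The helper below only illustrates that regrouping idea; it is not evalscope's implementation, and it assumes the data_dict[subset][split] layout visible in the removed gen_prompts code above:

# Standalone illustration (hypothetical helper, not evalscope's reformat_subset):
# regroup rows loaded under a single 'default' subset into per-key subsets.
from collections import defaultdict


def regroup_by_key(data_dict: dict, subset_key: str, fmt: str = '{}') -> dict:
    regrouped = defaultdict(lambda: defaultdict(list))
    for split_name, rows in data_dict['default'].items():
        for row in rows:
            regrouped[fmt.format(row[subset_key])][split_name].append(row)
    return {subset: dict(splits) for subset, splits in regrouped.items()}


# e.g. {'default': {'test': [{'category': 'math', ...}, {'category': 'law', ...}]}}
# becomes {'math': {'test': [...]}, 'law': {'test': [...]}}
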
evalscope/benchmarks/musr/__init__.py — new empty file, no content to diff.

evalscope/benchmarks/musr/musr_adapter.py (new file)

@@ -0,0 +1,68 @@
+import ast
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='musr',
+    pretty_name='MuSR',
+    dataset_id='AI-ModelScope/MuSR',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.',  # noqa: E501
+)
+class MuSRAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        choices = self.format_choice(ast.literal_eval(input_d['choices']))
+
+        full_prompt = self.prompt_template.format(
+            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_choice(self, options: list):
+        option_str = ''
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return self.choices[input_d['answer_index']]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)

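A self-contained demo of how the MuSR adapter assembles its prompt from a raw record: choices arrive as a stringified Python list (hence ast.literal_eval), are lettered A-F, and are substituted into the prompt_template. The record values below are invented for illustration:

# Demo of the prompt construction shown above; the record is made up.
import ast

PROMPT_TEMPLATE = ('{narrative}\n\n{question}\n\n{choices}\n'
                   'Think step by step and then finish your answer with "the answer is (X)" '
                   'where X is the correct letter choice.')
CHOICE_LETTERS = ['A', 'B', 'C', 'D', 'E', 'F']

record = {
    'narrative': 'Three suspects were seen near the library...',  # invented narrative
    'question': 'Who is the most likely murderer?',
    'choices': "['Alice', 'Bob']",   # stored as a string, hence ast.literal_eval
    'answer_index': 1,
}

options = ast.literal_eval(record['choices'])
choices_block = ''.join(f'({letter}): {opt}\n' for opt, letter in zip(options, CHOICE_LETTERS))
prompt = PROMPT_TEMPLATE.format(narrative=record['narrative'],
                                question=record['question'],
                                choices=choices_block)
print(prompt)
print('gold =', CHOICE_LETTERS[record['answer_index']])  # -> 'B'
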
evalscope/benchmarks/process_bench/__init__.py — new empty file, no content to diff.

evalscope/benchmarks/process_bench/critique_template.txt (new file)

@@ -0,0 +1,13 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.

evalscope/benchmarks/process_bench/process_bench_adapter.py (new file)

@@ -0,0 +1,96 @@
+import os
+import re
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
+from evalscope.models import ChatGenerationModelAdapter
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
+@Benchmark.register(
+    name='process_bench',
+    pretty_name='ProcessBench',
+    dataset_id='Qwen/ProcessBench',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ProcessBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+
+        # register metrics
+        metric_registry.register(Metric(name='error_acc', object=mean))
+        metric_registry.register(Metric(name='correct_acc', object=mean))
+        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        problem = input_d['problem']
+        steps = input_d['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return int(input_d['label'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        pred = ProcessBenchAdapter.extract_answer(result)
+        try:
+            pred = int(pred)
+        except Exception:
+            pred = None
+        return pred
+
+    def match(self, gold: int, pred: int) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return gold == pred
+
+    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
+        reviews_list = kwargs['reviews_list']
+        error_data = []
+        correct_data = []
+        for res, raw in zip(review_res_list, reviews_list):
+            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
+                correct_data.append(res)
+            else:
+                error_data.append(res)
+        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        return super().compute_metric(data)
+
+    @staticmethod
+    def extract_answer(solution_text: str):
+        boxed_pattern = r'\\boxed\{([^}]*)\}'
+        matches = re.findall(boxed_pattern, solution_text)
+        if matches:
+            return matches[-1].strip()
+        return None

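The ProcessBench adapter splits reviews into error-free samples (label == -1, averaged as correct_acc) and erroneous samples (error_acc), and passes both groups to simple_f1_score. The sketch below shows one plausible aggregation, the harmonic mean of the two accuracies; treat it as an assumption, since the actual simple_f1_score implementation lives in evalscope/metrics and is not shown in this diff:

# Hypothetical sketch of an F1-style aggregation over the two accuracy groups;
# not the evalscope.metrics implementation.
from statistics import mean


def f1_of_two_accuracies(correct_scores: list, error_scores: list) -> float:
    correct_acc = mean(correct_scores) if correct_scores else 0.0
    error_acc = mean(error_scores) if error_scores else 0.0
    if correct_acc + error_acc == 0:
        return 0.0
    return 2 * correct_acc * error_acc / (correct_acc + error_acc)


# match() yields 0/1 per sample, so the inputs are lists of 0/1 scores:
print(f1_of_two_accuracies([1, 1, 0], [1, 0, 1, 1]))
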
evalscope/benchmarks/race/race_adapter.py

@@ -4,7 +4,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):

         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice

evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py

@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger

@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',

evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py

@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger, normalize_score
+from evalscope.utils import get_logger

 # flake8: noqa

@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -259,7 +258,7 @@ class TruthfulQaAdapter(DataAdapter):

         return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}

-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         Compute evaluation result by specific metric for each subset.

@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')

         # To get mc2 score
-        return [{
-            'metric_name': self.metric_list[0].name,
-            'score': self.metric_list[0].object(mc2_list),
-            'num': len(mc2_list)
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)

evalscope/cli/start_app.py

@@ -3,7 +3,6 @@ import os
 from argparse import ArgumentParser

 from evalscope.cli.base import CLICommand
-from evalscope.report.app import add_argument, create_app


 def subparser_func(args):
@@ -22,9 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.report.app import add_argument
+
         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
+        from evalscope.report.app import create_app
+
         create_app(self.args)

evalscope/cli/start_eval.py

@@ -1,10 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from argparse import ArgumentParser

-from evalscope.arguments import add_argument
 from evalscope.cli.base import CLICommand
-from evalscope.run import run_task


 def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.arguments import add_argument
+
         parser = parsers.add_parser(EvalCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
+        from evalscope.run import run_task
+
         run_task(self.args)