evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import OutputType
 from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
@@ -11,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
+    tags=['Mathematics'],
+    description=
+    'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
     dataset_id='opencompass/AIME2025',
     subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
@@ -47,6 +47,11 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
+    tags=['Instruction-Following', 'Reasoning'],
+    description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+    'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+    'provide more accurate and cost-effective model assessments. '
+    'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
     dataset_id='AI-ModelScope/alpaca_eval',
     subset_list=['alpaca_eval_gpt4_baseline'],
     metric_list=['winrate'],
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='arc',
     pretty_name='ARC',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.', # noqa: E501
     dataset_id='modelscope/ai2_arc',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -1,5 +1,3 @@
-import re
-from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -19,12 +17,18 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
+    tags=['Instruction-Following', 'Reasoning'],
+    description=
+    'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+    'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+    'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+    'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.', # noqa: E501
     dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
     metric_list=['winrate'],
     few_shot_num=0,
     train_split=None,
     eval_split='test')
-class AlpacaEvalAdapter(DataAdapter):
+class ArenaHardAdapter(DataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -59,6 +59,9 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 @Benchmark.register(
     name='bbh',
     pretty_name='BBH',
+    tags=['Reasoning'],
+    description=
+    'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.', # noqa: E501
     dataset_id='modelscope/bbh',
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -28,6 +28,8 @@ class BenchmarkMeta:
     system_prompt: Optional[str] = None
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
+    description: Optional[str] = None
+    tags: Optional[List[str]] = field(default_factory=list)
     filters: Optional[OrderedDict] = None
     extra_params: Optional[Dict] = field(default_factory=dict)
 
File without changes
@@ -0,0 +1,237 @@
+import copy
+import importlib
+import json
+import re
+import traceback
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBJECT_MAPPING = {
+    'simple': 'AST_NON_LIVE',
+    'multiple': 'AST_NON_LIVE',
+    'parallel': 'AST_NON_LIVE',
+    'parallel_multiple': 'AST_NON_LIVE',
+    'java': 'AST_NON_LIVE',
+    'javascript': 'AST_NON_LIVE',
+    'live_simple': 'AST_LIVE',
+    'live_multiple': 'AST_LIVE',
+    'live_parallel': 'AST_LIVE',
+    'live_parallel_multiple': 'AST_LIVE',
+    'irrelevance': 'RELEVANCE',
+    'live_relevance': 'RELEVANCE',
+    'live_irrelevance': 'RELEVANCE',
+    'multi_turn_base': 'MULTI_TURN',
+    'multi_turn_miss_func': 'MULTI_TURN',
+    'multi_turn_miss_param': 'MULTI_TURN',
+    'multi_turn_long_context': 'MULTI_TURN'
+}
+
+
+@Benchmark.register(
+    name='bfcl_v3',
+    pretty_name='BFCL-v3',
+    tags=['Agent'],
+    description=
+    'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
+    'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
+    'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+    'Need to run `pip install bfcl-eval` before evaluating. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)', # noqa: E501
+    dataset_id='AI-ModelScope/bfcl_v3',
+    subset_list=list(SUBJECT_MAPPING.keys()),
+    model_adapter='bfcl_server',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    extra_params={
+        'underscore_to_dot': True,
+        'is_fc_model': True,
+    })
+class BFCLAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('bfcl_eval')
+        if spec is None:
+            raise ImportError(
+                '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+
+        self.category_map = SUBJECT_MAPPING
+
+        extra_params = kwargs.get('extra_params', {})
+        self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
+        self.is_fc_model = extra_params.get('is_fc_model', True)
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+
+    def preprocess_row(self, row: dict):
+        """
+        Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
+        """
+        row['should_execute_tool_calls'] = True if row['multi_turn'] else False
+        row['functions'] = json.loads(row['functions'])
+        row['tools'] = json.loads(row['tools'])
+        row['turns'] = json.loads(row['turns'])
+        row['missing_functions'] = json.loads(row['missed_functions'])
+        row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
+        row['initial_config'] = json.loads(row['initial_config'])
+        row['is_fc_model'] = self.is_fc_model
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        self.preprocess_row(input_d)
+
+        # If the model is a function calling model, we need to remove the system prompt
+        if self.is_fc_model:
+            turns = input_d['turns']
+            new_turns = []
+            for turn_idx, messages in enumerate(turns):
+                current_messages = messages.copy()
+                if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
+                    current_messages = current_messages[1:]
+                new_turns.append(current_messages)
+            input_d['turns'] = new_turns
+
+        return self.gen_prompt_data(prompt='', messages=input_d)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('ground_truth', )
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
+        row = copy.deepcopy(raw_input_d)
+        del row['turns']  # Remove turns as they are not needed for the match function
+
+        row['generation'] = result
+        return row
+
+    def match(self, gold: dict, pred: dict) -> dict:
+        from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
+        from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
+                                                   default_decode_execute_prompting)
+        from bfcl_eval.utils import is_empty_output
+
+        # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+        # which decides if model was provided with functions of the type
+        # spotify.list_songs or spotify_list_songs
+        # It is False for all llama models (when using via prompting)
+        # and True for API calls
+        if self.underscore_to_dot:
+            dummy_model = 'gpt-4o-2024-11-20-FC'
+        else:
+            dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+        row = pred
+        test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+        if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+            error = None
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                # successful decode means valid function call was present
+                contains_func_call = True
+                if is_empty_output(decoded_tool_calls):
+                    # Empty output is not considered as a valid function call
+                    contains_func_call = False
+                    error = 'Empty decoded output.'
+            except Exception:
+                contains_func_call = False
+                error = f'Failed to decode with traceback: {traceback.format_exc()}'
+            finally:
+                valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                score_result = {'valid': valid, 'error_message': error}
+
+        elif row['multi_turn']:
+            # each step might give a list of tool calls and each turn is multi-step
+            # and multi-turn has generations of all the turns
+            # hence in a multi-turn setting,
+            # multi_turn_decoded_generations is a list of list of list of strings
+            multi_turn_decoded_generations: list[list[list[str]]] = []
+            for single_turn_generations in row['generation']:
+                single_turn_decoded_generations: list[list[str]] = []
+                for generation in single_turn_generations:
+                    try:
+                        if self.is_fc_model:
+                            tool_calls = convert_to_function_call(generation)
+                        else:
+                            tool_calls = default_decode_execute_prompting(generation)
+
+                        single_turn_decoded_generations.append(tool_calls)
+                    except Exception:
+                        single_turn_decoded_generations.append([generation])
+
+                multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+            try:
+                raw_score_result = multi_turn_checker(
+                    multi_turn_decoded_generations,
+                    row['ground_truth'],
+                    row,
+                    test_category,
+                    dummy_model,
+                )
+            except Exception:
+                raw_score_result = {
+                    'valid': False,
+                    'error_type': 'multi_turn:checker_failed',
+                    'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                }
+
+            score_result = {
+                'valid': float(raw_score_result['valid']),
+                'error_message': raw_score_result.get('error_message', ''),
+                'error_type': raw_score_result.get('error_type', ''),
+            }
+        else:
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                score_result = ast_checker(
+                    row['functions'],
+                    decoded_tool_calls,
+                    row['ground_truth'],
+                    row['language'],
+                    row['test_category'],
+                    dummy_model,
+                )
+            except Exception:
+                score_result = {
+                    'valid': False,
+                    'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                    'error_type': 'ast_decoder:decoder_failed',
+                }
+
+        return {
+            'AverageAccuracy': float(score_result['valid']),
+            'raw_score_result': score_result,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
+        # aggregate review results
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
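The registration above already notes the external dependency (`pip install bfcl-eval`) and the two `extra_params` knobs consumed in `__init__`. For orientation, here is a minimal sketch of how the new benchmark might be invoked, assuming evalscope's existing `TaskConfig`/`run_task` entry points and an OpenAI-compatible endpoint; the model name and URL are placeholders:

```python
# Hypothetical invocation of the new bfcl_v3 benchmark; install the checker first:
#   pip install bfcl-eval
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',         # placeholder served model name
    api_url='http://127.0.0.1:8801/v1',  # placeholder OpenAI-compatible endpoint
    eval_type='service',                 # evaluate a served model rather than a local checkpoint
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            'subset_list': ['simple', 'live_simple', 'multi_turn_base'],
            'extra_params': {
                'underscore_to_dot': True,  # mirrors the defaults registered above
                'is_fc_model': True,        # the served model emits native tool calls
            },
        },
    },
)

run_task(task_cfg=task_cfg)
```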
@@ -126,6 +126,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='ceval',
     pretty_name='C-Eval',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.', # noqa: E501
     dataset_id='modelscope/ceval-exam',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -87,7 +87,10 @@ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应
 
 @Benchmark.register(
     name='chinese_simpleqa',
-    pretty_name='Chinese SimpleQA',
+    pretty_name='Chinese-SimpleQA',
+    tags=['Knowledge', 'QA', 'Chinese'],
+    description=
+    "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.", # noqa: E501
     subset_list=SUBSET_LIST,
     dataset_id='AI-ModelScope/Chinese-SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
@@ -103,6 +103,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='cmmlu',
     pretty_name='C-MMLU',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
     dataset_id='modelscope/cmmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='competition_math',
     pretty_name='MATH',
+    tags=['Mathematics'],
+    description=
+    'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
     dataset_id='modelscope/competition_math',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Union
 
-from evalscope.benchmarks.utils import PromptData, preprocess_decorator
+from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -15,6 +15,13 @@ logger = get_logger()
 
 
 class DataAdapter(ABC):
+    """
+    Data Adapter for the benchmark. You need to implement the following methods:
+        - gen_prompt
+        - get_gold_answer
+        - parse_pred_result
+        - match
+    """
 
     def __init__(self,
                  name: str,
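The class-level docstring now names the four hooks a concrete adapter must provide. As a rough illustration (not taken from the package), a minimal subclass wired through `Benchmark.register` could look like the following; the benchmark name, dataset path, and the `query`/`answer` field names are hypothetical:

```python
# Illustrative skeleton only; nothing here ships with evalscope.
from evalscope.benchmarks import Benchmark, DataAdapter


@Benchmark.register(
    name='my_custom_qa',
    pretty_name='MyCustomQA',
    tags=['QA'],
    description='A toy QA benchmark used to illustrate the DataAdapter contract.',
    dataset_id='path/to/local_dataset',
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test')
class MyCustomQAAdapter(DataAdapter):

    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
        # Wrap the raw question into the prompt payload consumed by the model adapter.
        return self.gen_prompt_data(prompt=input_d['query'])

    def get_gold_answer(self, input_d):
        return input_d['answer']

    def parse_pred_result(self, result, raw_input_d=None, eval_type=None):
        return result.strip()

    def match(self, gold, pred):
        return float(gold == pred)
```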
@@ -31,30 +38,37 @@ class DataAdapter(ABC):
                  system_prompt: Optional[str] = None,
                  query_template: Optional[str] = None,
                  pretty_name: Optional[str] = None,
+                 description: Optional[str] = None,
+                 tags: Optional[List[str]] = None,
                  **kwargs):
         """
-        Data Adapter for the benchmark. You need to implement the following methods:
-            - gen_prompt
-            - get_gold_answer
-            - parse_pred_result
-            - match
         Args:
             name: str, the name of the benchmark.
             dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
+            model_adapter: str, the model adapter to use for the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
+            llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
+            output_types: list, the output types of the model adapter. Default: [model_adapter]
             few_shot_num: int, number of few-shot examples. Default: 0
             train_split: str, usually for few-shot examples. e.g. 'train'
             eval_split: str, the target eval split name. e.g. 'test'
             prompt_template: str, the prompt template for the benchmark,
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                 the form of A or B or C or D, do not output explanation:`
-        """
+            system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
+            query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
+            pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
+            description: str, the description of the benchmark,
+                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
+        """ # noqa: E501
         self.name = name
         self.dataset_id = dataset_id
         self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
+        self.llm_as_a_judge = llm_as_a_judge
+        self.output_types = output_types or [model_adapter]
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
@@ -62,9 +76,9 @@ class DataAdapter(ABC):
         self.system_prompt = system_prompt
         self.query_template = query_template
         self.pretty_name = pretty_name
+        self.description = description
+        self.tags = tags or []
         self.config_kwargs = kwargs
-        self.output_types = output_types or [model_adapter]
-        self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
 
@@ -156,6 +170,49 @@ class DataAdapter(ABC):
         """
         return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
+    def load_with_snapshot(self,
+                           file_structure: Dict[str, List[str]],
+                           dataset_name_or_path: str = None,
+                           subset_list: list = None,
+                           work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                           **kwargs) -> dict:
+        """
+        For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
+        This feature supports both remote and local datasets.
+
+        Args:
+            file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
+            dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
+            subset_list: list of subset names for the dataset.
+            work_dir: str, the working directory to store the dataset.
+        Returns: {'subset_name': {'eval': eval_dataset}}
+        """ # noqa: E501
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
+        subset_list = subset_list or self.subset_list
+
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # flatten file structure
+            file_names = [file for sub_files in file_structure.values() for file in sub_files]
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
+        # read and process files
+        data_dict = defaultdict(dict)
+        for sub_name in subset_list:
+            file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
+            # not train split, only eval split
+            data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)
+
+        return data_dict
+
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
         Reformat the dataset subset with subset_key and format.
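A sketch of how an adapter might route its `load` through the new snapshot-based helper; the subset names and file paths in `file_structure` are placeholders, and the class omits the other abstract hooks for brevity:

```python
# Partial sketch: only the load() override is shown; gen_prompt, get_gold_answer,
# parse_pred_result and match would still need to be implemented.
from evalscope.benchmarks import DataAdapter


class SnapshotBackedAdapter(DataAdapter):

    def load(self, **kwargs):
        # Placeholder subsets/files; the shape matches the docstring above.
        file_structure = {
            'subset_a': ['subset_a/test.jsonl'],
            'subset_b': ['subset_b/test.jsonl'],
        }
        # Returns {'subset_a': {self.eval_split: [...]}, 'subset_b': {self.eval_split: [...]}}
        return self.load_with_snapshot(file_structure, **kwargs)
```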
@@ -249,7 +306,7 @@ class DataAdapter(ABC):
     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
                             **kwargs) -> Dict[str, List[float]]:
         """
-        compute weighted mean of the bleu score of all samples
+        compute weighted mean of score of all samples
 
         Args:
             review_res_list: [score1, score2, ...]
@@ -270,7 +327,7 @@ class DataAdapter(ABC):
                 items['AverageAccuracy'].append(scores)
         return items
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+    def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
 
@@ -278,7 +335,7 @@ class DataAdapter(ABC):
             subset_score_map: The subset-score map.
                 e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
 
-            report_name: str, the user-defined report name. Default: None
+            model_name: The evaluation model name.
 
         Returns: The evaluation report.
 
@@ -312,9 +369,17 @@ class DataAdapter(ABC):
                 "model_name": "qwen2.5"
             }
         """ # noqa: E501
-        kwargs['category_map'] = self.category_map
-        kwargs['metric_list'] = self.metric_list
-        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
+        return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
+
+    def post_process_report(self, report: Report, **kwargs):
+        """
+        Post-process the report after generation. Draw a chart, save to file, etc.
+        This method can be overridden to customize the report format or content.
+
+        Args:
+            report (Report): The generated report.
+        """
+        pass
 
     def gen_prompt_data(self,
                         prompt: str,
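Since `post_process_report` is a deliberate no-op hook, subclasses can override it to persist extra artifacts once the report has been generated. A hedged sketch; the `output_dir` kwarg and the file name are assumptions made for illustration:

```python
# Partial sketch: only the overridden hook is shown.
import os

from evalscope.benchmarks import DataAdapter


class ReportDumpingAdapter(DataAdapter):

    def post_process_report(self, report, **kwargs):
        output_dir = kwargs.get('output_dir', '.')  # assumed kwarg, not guaranteed by the API
        with open(os.path.join(output_dir, 'report_snapshot.txt'), 'w') as f:
            # str(report) avoids assuming anything about the Report API beyond repr/str.
            f.write(str(report))
```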
@@ -324,6 +389,23 @@ class DataAdapter(ABC):
                         id: Optional[Union[int, str]] = None,
                         messages: Optional[List[dict]] = None,
                         **kwargs) -> dict:
+        """
+        Generates a dictionary representation of prompt data for evaluation or inference.
+
+        Args:
+            prompt (str): The main prompt or input text. Can also be a list of prompts.
+            system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
+            choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
+                If not provided, uses self.choices. Defaults to None.
+            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
+                Defaults to 0 if not provided. Defaults to None.
+            id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
+            messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
+                If messages is provided, it will be used as the prompt data instead of the prompt string.
+
+        Returns:
+            dict: A dictionary representation of the prompt data, suitable for further processing or model input.
+        """ # noqa: E501
         data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
             data=data,
@@ -416,7 +498,8 @@ class DataAdapter(ABC):
 
         # Extract question from raw_input if available
         raw_input = kwargs.get('raw_input', {})
-        question_keys = ['question', 'prompt', 'query', 'problem']
+        question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
+        # Find the first non-empty question key in raw_input
         question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
 
         # Request judge and obtain score
@@ -14,6 +14,7 @@ logger = get_logger()
 @Benchmark.register(
     name='data_collection',
     dataset_id='', # dataset_id need to be set
+    description='Data collection',
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
File without changes