evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -88,6 +88,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu_redux',
     pretty_name='MMLU-Redux',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='AI-ModelScope/mmlu-redux-2.0',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/musr/musr_adapter.py
@@ -10,6 +10,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='musr',
     pretty_name='MuSR',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
     dataset_id='AI-ModelScope/MuSR',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/needle_haystack/__init__.py
File without changes

evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -0,0 +1,348 @@
+from itertools import product
+from tqdm import tqdm
+from typing import TYPE_CHECKING, List, Union
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import LLMJudge, exact_match
+from evalscope.metrics.metrics import mean
+from evalscope.utils import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.report import Report
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+<question>
+{question}
+</question>
+
+Don't give information outside the document or repeat your findings."""
+
+
+@Benchmark.register(
+    name='needle_haystack',
+    pretty_name='Needle-in-a-Haystack',
+    tags=['Retrieval', 'Long Context'],
+    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+    'It requires the model to find specific information within a large corpus of text. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
+    dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+    metric_list=['AverageAccuracy'],
+    subset_list=['english', 'chinese'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+    prompt_template=PROMPT_TEMPLATE,
+    extra_params={
+        'retrieval_question': 'What is the best thing to do in San Francisco?',
+        'needles':
+        ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+        'context_lengths_min': 1000,
+        'context_lengths_max': 32000,
+        'context_lengths_num_intervals': 10,
+        'document_depth_percent_min': 0,
+        'document_depth_percent_max': 100,
+        'document_depth_percent_intervals': 10,
+        'tokenizer_path': 'Qwen/Qwen3-0.6B',
+        'show_score': False,
+    })
+class NeedleHaystackAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.llm_as_a_judge = True
+        # set extra params
+        extra_params = kwargs.get('extra_params', {})
+        self.retrieval_question = extra_params.get('retrieval_question',
+                                                   'What is the best thing to do in San Francisco?')
+        self.needles = extra_params.get(
+            'needles',
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'])
+        self.context_lengths_min = extra_params.get('context_lengths_min', 1000)
+        self.context_lengths_max = extra_params.get('context_lengths_max', 32000)
+        self.context_lengths_num_intervals = extra_params.get('context_lengths_num_intervals', 10)
+        self.document_depth_percent_min = extra_params.get('document_depth_percent_min', 0)
+        self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
+        self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
+        self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = extra_params.get('show_score', False)
+
+        self._init_tokenizer()
+        self._init_length()
+
+    def _init_length(self):
+        """ Initialize context lengths and document depth percentages based on the provided parameters."""
+        import numpy as np
+
+        self.context_lengths = np.round(
+            np.linspace(
+                self.context_lengths_min,
+                self.context_lengths_max,
+                num=self.context_lengths_num_intervals,
+                endpoint=True)).astype(int)
+
+        self.document_depth_percents = np.round(
+            np.linspace(
+                self.document_depth_percent_min,
+                self.document_depth_percent_max,
+                num=self.document_depth_percent_intervals,
+                endpoint=True)).astype(int)
+
+    def _init_tokenizer(self):
+        """ Initialize the tokenizer based on the provided tokenizer path."""
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
+
+    def load(self, **kwargs):
+        # default load with snapshot
+        kwargs['file_structure'] = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+        data_dict = super().load_with_snapshot(**kwargs)
+        return data_dict
+
+    def gen_prompts(self, data_dict: dict) -> dict:
+        """
+        Generate dataset prompts from raw input, unify the prompt format for different datasets.
+
+        Args:
+            data_dict: {'english': {'test': [sample_d_1, sample_d_2, ...]},
+                        'chinese': {'test': [sample_d_1, sample_d_2, ...]}}
+
+        Returns:
+            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
+            prompt_d_i (dict): refer to the output of gen_prompt method.
+
+        e.g. train -- few-shot data, test -- target dataset to evaluate.
+        """
+        res_dict: dict = {}
+
+        for sub_name, sub_data_dict in data_dict.items():
+            res_dict[sub_name] = []
+            for sample_d in sub_data_dict[self.eval_split]:
+                # Generate prompts for each sample in the dataset
+                tokens_context = self._get_context_tokens(sample_d['text'])
+                for context_length, depth_percent in tqdm(
+                        product(self.context_lengths, self.document_depth_percents),
+                        desc=f'Generating {sub_name} prompts'):
+                    # Insert needles into the context at the specified depth percentage
+                    context = self._insert_needles(tokens_context, depth_percent, context_length)
+                    # Build the input dictionary for the prompt
+                    input_d = {
+                        'context_length': int(context_length),
+                        'depth_percent': int(depth_percent),
+                        'question': self.retrieval_question,
+                        'answer': '\n'.join(self.needles),
+                        'context': context,
+                    }
+                    prompt_d = self.gen_prompt(input_d=input_d)
+                    prompt_d[AnswerKeys.RAW_INPUT] = input_d
+                    res_dict[sub_name].append(prompt_d)
+
+        return res_dict
+
+    def _get_context_tokens(self, input_context: str) -> list:
+        """
+        Encodes the context string into tokens using the tokenizer, ensuring the tokenized context
+        is at least as long as the maximum context length required.
+
+        Args:
+            input_context (str): The context string to be tokenized.
+
+        Returns:
+            List[int]: A list of token IDs representing the context.
+        """
+        max_context_length = max(self.context_lengths)
+        context = input_context
+        tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+        # Repeat the context until reaching the required length
+        while len(tokens_context) < max_context_length:
+            context += '\n' + input_context
+            tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+        return tokens_context
+
+    def _insert_needles(self, tokens_context, depth_percent, context_length):
+        """
+        Inserts multiple needles (specific facts or pieces of information) into the original context string at
+        designated depth percentages, effectively distributing these needles throughout the context. This method
+        is designed to test a model's ability to retrieve specific information (needles) from a larger body of text
+        (haystack) based on the placement depth of these needles.
+
+        The method first encodes the context and each needle into tokens to calculate their lengths in tokens.
+        It then adjusts the context length to accommodate the final buffer length. This is crucial for ensuring
+        that the total token count (context plus needles) does not exceed the maximum allowable context length,
+        which might otherwise lead to information being truncated.
+
+        This approach calculates the initial insertion point for the first needle as before but then calculates even
+        spacing for the remaining needles based on the remaining context length. It ensures that needles are
+        distributed as evenly as possible throughout the context after the first insertion.
+
+        Args:
+            tokens_context (List[int]): The original context tokens.
+            depth_percent (float): The depth percent at which to insert the needles.
+            context_length (int): The total length of the context in tokens, adjusted for final buffer.
+
+        Returns:
+            str: The new context with needles inserted.
+        """
+
+        context_length -= 150
+
+        # Calculate the total length of all needles in tokens
+        total_needles_length = sum(len(self.tokenizer.encode(needle)) for needle in self.needles)
+
+        # Ensure context length accounts for needles
+        if len(tokens_context) + total_needles_length > context_length:
+            tokens_context = tokens_context[:context_length - total_needles_length]
+
+        # To evenly distribute the needles, we calculate the intervals they need to be inserted.
+        depth_percent_interval = (100 - depth_percent) / len(self.needles)
+
+        # Reset the insertion percentages list for the current context
+        self.insertion_percentages = []
+
+        # Insert needles at calculated points
+        for needle in self.needles:
+
+            tokens_needle = self.tokenizer.encode(needle)
+
+            if depth_percent == 100:
+                # If your depth percent is 100 (which means your needle is the last thing in the doc),
+                # throw it at the end
+                tokens_context = tokens_context + tokens_needle
+            else:
+                # Go get the position (in terms of tokens) to insert your needle
+                insertion_point = int(len(tokens_context) * (depth_percent / 100))
+
+                # tokens_new_context represents the tokens before the needle
+                tokens_new_context = tokens_context[:insertion_point]
+
+                # We want to make sure that we place our needle at a sentence break
+                # so we first see what token a '.' is
+                period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
+                    '。') # Handle both English and Chinese periods
+
+                # Then we iteration backwards until we find the first period
+                while tokens_new_context and tokens_new_context[-1] not in period_tokens:
+                    insertion_point -= 1
+                    tokens_new_context = tokens_context[:insertion_point]
+
+                # Insert the needle into the context at the found position
+                tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
+
+                # Log
+                insertion_percentage = (insertion_point / len(tokens_context)) * 100
+                self.insertion_percentages.append(insertion_percentage)
+                logger.debug(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                             f'total length now: {len(tokens_context)} tokens')
+
+            # Adjust depth for next needle
+            depth_percent += depth_percent_interval
+
+        new_context = self.tokenizer.decode(tokens_context)
+        return new_context
+
+    def gen_prompt(self, input_d: dict, **kwargs) -> dict:
+        """
+        Generate the prompt for each sample in the dataset.
+        Args:
+            input_d: A dictionary containing the input data for the prompt.
+                It should contain 'context' and optionally 'question'.
+        Returns:
+            A dictionary containing the prompt data
+        """
+        context = input_d.get('context')
+        question = input_d.get('question')
+
+        prompt = self.prompt_template.format(context=context, question=question)
+
+        return self.gen_prompt_data(prompt, system_prompt=self.system_prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d.get('answer', '').strip()
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import normalize_answer
+        norm_gold = normalize_answer(gold)
+        norm_pred = normalize_answer(pred)
+        # Use exact match for Needle in a Haystack
+        return exact_match(gold=norm_gold, pred=norm_pred)
+
+    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> dict:
+        """
+        Use LLM as a judge to evaluate the predicted answer against the gold answer.
+        """
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input.get('question')
+        context_length = raw_input.get('context_length')
+        depth_percent = raw_input.get('depth_percent')
+
+        # get grading response
+        prompt = ORM_USER_TEMPLATE.format(question=question, gold=gold, pred=pred)
+        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # parse grading score with regex, [[score]]
+        score = parse_score(orm_response) if orm_response else 0.0
+        return {f'Context#{context_length} Depth#{depth_percent}': score}
+
+    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: List[dict]
+
+        """
+        items = super().compute_dict_metric(review_res_list, **kwargs)
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
+
+    def post_process_report(self, report: 'Report', **kwargs):
+        try:
+            import os
+
+            from .utils import draw_score_chat
+
+            report_path = kwargs.get('report_path')
+            data_frame = report.to_dataframe()
+            # split `Metric` to `Context` and `Depth`
+            data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
+            data_frame['Depth'] = data_frame['Depth'].str.replace('Depth#', '').astype(float)
+            data_frame['Context'] = data_frame['Context'].str.replace('Context#', '').astype(int)
+            # split by `Subset` to multi sub data frame
+            for subset in data_frame['Subset'].unique():
+                sub_df = data_frame[data_frame['Subset'] == subset]
+                # draw charts for each subset
+                pivot_table = sub_df.pivot_table(
+                    values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
+                pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
+                draw_score_chat(
+                    pivot_table,
+                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                    show_score=self.show_score)
+
+        except Exception as e:
+            logger.error(f'Error generating charts: {e}')
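
For orientation, the sketch below reproduces, outside of evalscope, the evaluation grid that the new adapter builds in _init_length and sweeps in gen_prompts: every (context length, depth percent) pair becomes one prompt, and llm_match later reports each pair as a separate Context#<length> Depth#<depth> score. It is a standalone illustration using the default extra_params shown above, not code from the package.

from itertools import product

import numpy as np

# Defaults from extra_params: 10 context lengths from 1k to 32k tokens,
# 10 document depths from 0% to 100%.
context_lengths = np.round(np.linspace(1000, 32000, num=10, endpoint=True)).astype(int)
depth_percents = np.round(np.linspace(0, 100, num=10, endpoint=True)).astype(int)

# One prompt per (length, depth) pair -> 10 x 10 = 100 prompts per subset.
for context_length, depth_percent in product(context_lengths, depth_percents):
    print(f'Context#{context_length} Depth#{depth_percent}')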

evalscope/benchmarks/needle_haystack/utils.py
@@ -0,0 +1,79 @@
+import matplotlib.pyplot as plt
+import os
+import re
+import seaborn as sns
+import string
+from matplotlib.colors import LinearSegmentedColormap
+
+
+def normalize_answer(s):
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def parse_score(score_str: str) -> int:
+    """
+    Parses a score string and returns an integer score.
+    The score should be in the format [[score]].
+    """
+    score_match = re.search(r'\[\[(\d+)\]\]', score_str)
+    if score_match:
+        score = int(score_match.group(1))
+        return score / 10.0
+    else:
+        return 0.0
+
+
+def draw_score_chat(pivot_table, outpath, show_score=False):
+    # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
+    cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
+
+    # Create the heatmap with better aesthetics
+    plt.figure(figsize=(17.5, 8)) # Can adjust these dimensions as needed
+    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
+
+    # More aesthetics
+    plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")') # Adds a title
+    plt.xlabel('Token Limit') # X-axis label
+    plt.ylabel('Depth Percent') # Y-axis label
+    plt.xticks(rotation=45) # Rotates the x-axis labels to prevent overlap
+    plt.yticks(rotation=0) # Ensures the y-axis labels are horizontal
+    plt.tight_layout() # Fits everything neatly into the figure area
+
+    # save the figure
+    plt.savefig(outpath, dpi=300, bbox_inches='tight')
+
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
+Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
+You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
+You must provide your final score in the form of a number from 1 to 10, where:
+
+Score 1: The answer is completely unrelated to the reference.
+Score 3: The answer has minor relevance but does not align with the reference.
+Score 5: The answer has moderate relevance but contains inaccuracies.
+Score 7: The answer aligns with the reference but has minor omissions.
+Score 10: The answer is completely accurate and aligns perfectly with the reference.
+
+Only respond with a numberical score with formatted as [[score]].""" # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Question: {question}
+
+Reference Answer: {gold}
+
+Model Answer: {pred}
+"""
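
A quick, hedged usage check of the two text helpers added above (assumes evalscope 0.16.2 is installed; the example strings are made up):

from evalscope.benchmarks.needle_haystack.utils import normalize_answer, parse_score

# Lowercases, strips punctuation and articles, and collapses whitespace.
print(normalize_answer('The best thing to do is eat a Sandwich!'))
# -> 'best thing to do is eat sandwich'

# Judge replies are expected to contain [[score]] on a 1-10 scale,
# which is mapped to 0.0-1.0; anything else falls back to 0.0.
print(parse_score('[[7]]'))     # -> 0.7
print(parse_score('no score'))  # -> 0.0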

evalscope/benchmarks/process_bench/process_bench_adapter.py
@@ -12,6 +12,9 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
 @Benchmark.register(
     name='process_bench',
     pretty_name='ProcessBench',
+    tags=['Mathematical', 'Reasoning'],
+    description=
+    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
     dataset_id='Qwen/ProcessBench',
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],

evalscope/benchmarks/race/race_adapter.py
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='race',
     pretty_name='RACE',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.', # noqa: E501
     dataset_id='modelscope/race',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/simple_qa/simple_qa_adapter.py
@@ -95,6 +95,9 @@ Just return the letters "A", "B", or "C", with no text around it.
 @Benchmark.register(
     name='simple_qa',
     pretty_name='SimpleQA',
+    tags=['Knowledge', 'QA'],
+    description=
+    'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.', # noqa: E501
     dataset_id='AI-ModelScope/SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
     few_shot_num=0,

evalscope/benchmarks/super_gpqa/five_shot_prompt.txt
@@ -85,5 +85,6 @@ Answer: A.
 
 Question:
 {query}
+{choices}
 
 Answer: Let's think step by step.

evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py
@@ -109,6 +109,9 @@ SUBSET_MAPPING = {
 @Benchmark.register(
     name='super_gpqa',
     pretty_name='SuperGPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.', # noqa: E501
     dataset_id='m-a-p/SuperGPQA',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -139,13 +142,15 @@ class SuperGPQAAdapter(DataAdapter):
         return self.reformat_subset(data_dict, subset_key='field', format='{}')
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        choices = self._format_choices(input_d['options'])
         if not self.prompt_template:
             if few_shot_list:
-                prompt = self.few_shot_prompt.format(query=input_d['question'])
+                prompt = self.few_shot_prompt.format(query=question, choices=choices)
             else:
-                prompt = self.zero_shot_prompt.format(query=input_d['question'])
+                prompt = self.zero_shot_prompt.format(query=question, choices=choices)
         else:
-            prompt = self.prompt_template.format(query=input_d['question'])
+            prompt = self.prompt_template.format(query=question, choices=choices)
         return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
@@ -189,3 +194,16 @@ class SuperGPQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
+
+    def _format_choices(self, choices: list) -> str:
+        """
+        Format the choices into a string for display.
+
+        Args:
+            choices (list): List of choices.
+
+        Returns:
+            str: Formatted string of choices.
+        """
+        choice_list = [f'{option}) {content}' for option, content in zip(self.choices, choices)]
+        return '\n'.join(choice_list)
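
SuperGPQA prompts now embed the answer options explicitly. The standalone sketch below mimics what _format_choices produces; the option contents are made up, and the adapter itself zips its own option letters (A through J, since SuperGPQA questions have ten options) with the sample's options field.

choices = ['A', 'B', 'C', 'D']  # stand-in for the adapter's option letters
options = ['granite', 'basalt', 'obsidian', 'pumice']  # hypothetical option contents

# Same list comprehension as in the new _format_choices method.
choice_list = [f'{option}) {content}' for option, content in zip(choices, options)]
print('\n'.join(choice_list))
# A) granite
# B) basalt
# C) obsidian
# D) pumice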

evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt
@@ -1,3 +1,4 @@
 Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
 
 {query}
+{choices}

evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -8,6 +8,11 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
+    tags=['Reasoning', 'Agent'],
+    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+    'It includes various subsets such as in-domain and out-of-domain, '
+    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)', # noqa: E501
     dataset_id='AI-ModelScope/ToolBench-Static',
     subset_list=['in_domain', 'out_of_domain'],
     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
@@ -31,7 +36,10 @@ class ToolBenchAdapter(DataAdapter):
         Generate model prompt from input data.
         """
         messages = input_d['messages']
-        # use prepared messages
+        # use prepared messages and remove the name field
+        for message in messages:
+            if 'name' in message:
+                del message['name']
         return self.gen_prompt_data(prompt='', messages=messages)
 
     def get_gold_answer(self, input_d: dict) -> str:
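
A standalone illustration of the message cleanup added above: the adapter now strips the name field from the prepared chat messages before they are sent to the model. The messages below are made up for the example.

messages = [
    {'role': 'system', 'content': 'You are a tool-using assistant.'},
    {'role': 'assistant', 'name': 'search_tool', 'content': '{"query": "weather"}'},  # hypothetical sample
]

# Same loop as in the adapter: drop the 'name' key wherever it appears.
for message in messages:
    if 'name' in message:
        del message['name']

print(messages)  # the second message no longer carries a 'name' field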

evalscope/benchmarks/tool_bench/utils.py
@@ -1,13 +1,14 @@
 import json
-from rouge import Rouge
+
+from evalscope.metrics import compute_rouge_score_one_sample
 
 
 def evaluate_rougel(cand_list: list, ref_list: list):
     if len(ref_list) == 0:
         return 0
-    rouge = Rouge()
-    rouge_score = rouge.get_scores(hyps=cand_list, refs=ref_list, avg=True)
-    rougel = rouge_score['rouge-l']['f']
+    rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
+    rougel = rouge_score.get('rouge-l-f', 0)
+
     return rougel
 
 

evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -16,6 +16,9 @@ logger = get_logger()
 @Benchmark.register(
     name='trivia_qa',
     pretty_name='TriviaQA',
+    tags=['QA', 'Reading Comprehension'],
+    description=
+    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.', # noqa: E501
     dataset_id='modelscope/trivia_qa',
     subset_list=['default'],
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -21,6 +21,9 @@ logger = get_logger()
 @Benchmark.register(
     name='truthful_qa',
     pretty_name='TruthfulQA',
+    tags=['Knowledge'],
+    description=
+    'TruthfulQA is a benchmark designed to evaluate the ability of AI models to answer questions truthfully and accurately. It includes multiple-choice and generation tasks, focusing on the model\'s understanding of factual information and its ability to generate coherent responses.', # noqa: E501
     dataset_id='modelscope/truthful_qa',
     model_adapter=OutputType.CONTINUOUS,
     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],

evalscope/benchmarks/utils.py
@@ -33,3 +33,28 @@ def preprocess_decorator(func):
         return func(self, result, raw_input_d, **kwargs)
 
     return wrapper
+
+
+def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
+    """
+    Load a file with a specific extension and return its content as a list of dictionaries.
+    """
+    import json
+    import os
+
+    if isinstance(file_path, str):
+        file_path = [file_path]
+
+    data = []
+    for path in file_path:
+        if not os.path.exists(path):
+            raise FileNotFoundError(f'The file {path} does not exist.')
+
+        with open(path, 'r', encoding='utf-8') as f:
+            if path.endswith('.json'):
+                data.extend(json.load(f))
+            elif path.endswith('.jsonl'):
+                data.extend([json.loads(line) for line in f])
+            elif path.endswith('.txt'):
+                data.extend([{'text': f.read()}])
+    return data
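
Hedged usage sketch for the new loader: it accepts one path or a list of paths, dispatches on the file extension, and flattens everything into a single list of dicts, with each .txt file wrapped as one {'text': ...} record. The file names below are hypothetical.

from evalscope.benchmarks.utils import load_file_with_extension

# .json files are expected to contain a list of objects, .jsonl files one object
# per line, and each .txt file contributes a single {'text': <file content>} record.
records = load_file_with_extension(['samples.json', 'samples.jsonl', 'corpus.txt'])
print(len(records))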

evalscope/benchmarks/winogrande/winogrande_adapter.py
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='winogrande',
     pretty_name='Winogrande',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'Winogrande is a benchmark for evaluating AI models on commonsense reasoning tasks, specifically designed to test the ability to resolve ambiguous pronouns in sentences.', # noqa: E501
     dataset_id='AI-ModelScope/winogrande_val',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],