evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
 import os
+from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -15,7 +16,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
-    pretty_name='General MCQ',
+    pretty_name='General-MCQ',
+    description='A general multiple-choice question answering dataset.',
+    tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -24,7 +27,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
-    prompt_template='请回答问题,并选出其中的正确答案\n{query}',
+    prompt_template='请回答问题,并选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
     query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
@@ -34,28 +37,21 @@ class GeneralMCQAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        header = next(reader)
-                        for row in reader:
-                            item = dict(zip(header, row))
-                            rows.append(item)
-
-                        if subset_name in data_dict:
-                            data_dict[subset_name].update({split_name: rows})
-                        else:
-                            data_dict[subset_name] = {split_name: rows}
-
-        return data_dict
+                # Check for files with different extensions
+                for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+                    if os.path.exists(dataset_name_or_path):
+                        file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+                    else:
+                        file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+
+                    if os.path.exists(file_path):
+                        data_dict[subset_name][split_name] = loader(file_path)
+                        break  # Stop checking other extensions once a file is found
+
+        return dict(data_dict)
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_qa',
+    pretty_name='General-QA',
+    description='General Question Answering dataset',
+    tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
     metric_list=['AverageBLEU', 'AverageRouge'],
evalscope/benchmarks/gpqa/gpqa_adapter.py CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import exact_match
 @Benchmark.register(
     name='gpqa',
     pretty_name='GPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
     dataset_id='modelscope/gpqa',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/gsm8k/gsm8k_adapter.py CHANGED
@@ -15,6 +15,9 @@ logger = get_logger()
 @Benchmark.register(
     name='gsm8k',
     pretty_name='GSM8K',
+    tags=['Mathematics'],
+    description=
+    'GSM8K (Grade School Math 8K) is a dataset of grade school math problems, designed to evaluate the mathematical reasoning abilities of AI models.',
     dataset_id='modelscope/gsm8k',
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/hellaswag/hellaswag_adapter.py CHANGED
@@ -18,6 +18,9 @@ logger = get_logger()
 @Benchmark.register(
     name='hellaswag',
     pretty_name='HellaSwag',
+    tags=['Commonsense', 'MCQ', 'Knowledge'],
+    description=
+    'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
     dataset_id='modelscope/hellaswag',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()
 @Benchmark.register(
     name='humaneval',
     pretty_name='HumanEval',
+    tags=['Coding'],
+    description=
+    'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',  # noqa: E501
     dataset_id='modelscope/humaneval',
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
evalscope/benchmarks/ifeval/ifeval_adapter.py CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='ifeval',
     pretty_name='IFEval',
+    tags=['Instruction-Following'],
+    description=
+    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
     dataset_id='opencompass/ifeval',
     subset_list=['default'],
     metric_list=[
evalscope/benchmarks/iquiz/iquiz_adapter.py CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='iquiz',
     pretty_name='IQuiz',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py CHANGED
@@ -8,7 +8,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='live_code_bench',
-    pretty_name='Live Code Bench',
+    pretty_name='Live-Code-Bench',
+    tags=['Coding'],
+    description=
+    'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
     dataset_id='AI-ModelScope/code_generation_lite',
     subset_list=['release_latest'],
     metric_list=['Pass@1'],
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py CHANGED
@@ -11,6 +11,9 @@ SUBSET_LIST = ['default']
 @Benchmark.register(
     name='maritime_bench',
     pretty_name='MaritimeBench',
+    tags=['Maritime', 'MCQ', 'Knowledge'],
+    description=
+    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
     dataset_id='HiDolphin/MaritimeBench',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/math_500/math_500_adapter.py CHANGED
@@ -10,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='math_500',
     pretty_name='MATH-500',
+    tags=['Mathematics'],
+    description=
+    "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.",  # noqa: E501
     dataset_id='AI-ModelScope/MATH-500',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -136,6 +136,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu',
     pretty_name='MMLU',
+    tags=['Knowledge', 'MCQ'],
+    description=
+    "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.",  # noqa: E501
     dataset_id='modelscope/mmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -15,6 +15,9 @@ SUBSET_LIST = [
 @Benchmark.register(
     name='mmlu_pro',
     pretty_name='MMLU-Pro',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
     dataset_id='modelscope/MMLU-Pro',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py CHANGED
@@ -88,6 +88,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu_redux',
     pretty_name='MMLU-Redux',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
     dataset_id='AI-ModelScope/mmlu-redux-2.0',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/musr/musr_adapter.py CHANGED
@@ -10,6 +10,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='musr',
     pretty_name='MuSR',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.',  # noqa: E501
     dataset_id='AI-ModelScope/MuSR',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py CHANGED
@@ -28,9 +28,11 @@ Don't give information outside the document or repeat your findings."""
 
 @Benchmark.register(
     name='needle_haystack',
-    pretty_name='Needle in a Haystack',
-    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. \
-    It requires the model to find specific information within a large corpus of text.',
+    pretty_name='Needle-in-a-Haystack',
+    tags=['Retrieval', 'Long Context'],
+    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+    'It requires the model to find specific information within a large corpus of text. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
     dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
     metric_list=['AverageAccuracy'],
     subset_list=['english', 'chinese'],
@@ -50,6 +52,7 @@ Don't give information outside the document or repeat your findings."""
         'document_depth_percent_max': 100,
         'document_depth_percent_intervals': 10,
         'tokenizer_path': 'Qwen/Qwen3-0.6B',
+        'show_score': False,
     })
 class NeedleHaystackAdapter(DataAdapter):
 
@@ -71,11 +74,12 @@ class NeedleHaystackAdapter(DataAdapter):
         self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
         self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
         self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = extra_params.get('show_score', False)
 
-        self.__init_tokenizer()
-        self.__init_length()
+        self._init_tokenizer()
+        self._init_length()
 
-    def __init_length(self):
+    def _init_length(self):
         """ Initialize context lengths and document depth percentages based on the provided parameters."""
         import numpy as np
 
@@ -93,7 +97,7 @@ class NeedleHaystackAdapter(DataAdapter):
             num=self.document_depth_percent_intervals,
             endpoint=True)).astype(int)
 
-    def __init_tokenizer(self):
+    def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
@@ -335,7 +339,10 @@ class NeedleHaystackAdapter(DataAdapter):
                 pivot_table = sub_df.pivot_table(
                     values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
-                draw_score_chat(pivot_table, outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'))
+                draw_score_chat(
+                    pivot_table,
+                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                    show_score=self.show_score)
 
         except Exception as e:
             logger.error(f'Error generating charts: {e}')
evalscope/benchmarks/needle_haystack/utils.py CHANGED
@@ -37,13 +37,13 @@ def parse_score(score_str: str) -> int:
         return 0.0
 
 
-def draw_score_chat(pivot_table, outpath):
+def draw_score_chat(pivot_table, outpath, show_score=False):
     # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
     cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
 
     # Create the heatmap with better aesthetics
     plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
-    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=True, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
+    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
 
     # More aesthetics
     plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
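
Note (not part of the diff): the heatmap produced by draw_score_chat now omits per-cell score annotations unless the new show_score flag is enabled, since it defaults to False in the benchmark's extra_params. A rough sketch of turning it on; the TaskConfig/run_task entry points and the dataset_args layout are assumptions based on evalscope's usual configuration style, and the model name is illustrative:

    from evalscope import TaskConfig, run_task  # assumed public API

    task_cfg = TaskConfig(
        model='qwen-plus',             # illustrative model name
        datasets=['needle_haystack'],
        dataset_args={
            'needle_haystack': {
                'extra_params': {
                    'tokenizer_path': 'Qwen/Qwen3-0.6B',
                    'show_score': True,  # new in 0.16.2: annotate each heatmap cell with its score
                },
            },
        },
    )
    run_task(task_cfg=task_cfg)
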
evalscope/benchmarks/process_bench/process_bench_adapter.py CHANGED
@@ -12,6 +12,9 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
 @Benchmark.register(
     name='process_bench',
     pretty_name='ProcessBench',
+    tags=['Mathematical', 'Reasoning'],
+    description=
+    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
     dataset_id='Qwen/ProcessBench',
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='race',
     pretty_name='RACE',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.',  # noqa: E501
     dataset_id='modelscope/race',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/simple_qa/simple_qa_adapter.py CHANGED
@@ -95,6 +95,9 @@ Just return the letters "A", "B", or "C", with no text around it.
 @Benchmark.register(
     name='simple_qa',
     pretty_name='SimpleQA',
+    tags=['Knowledge', 'QA'],
+    description=
+    'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.',  # noqa: E501
     dataset_id='AI-ModelScope/SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
     few_shot_num=0,
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt CHANGED
@@ -85,5 +85,6 @@ Answer: A.
 
 Question:
 {query}
+{choices}
 
 Answer: Let's think step by step.
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py CHANGED
@@ -109,6 +109,9 @@ SUBSET_MAPPING = {
 @Benchmark.register(
     name='super_gpqa',
     pretty_name='SuperGPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.',  # noqa: E501
     dataset_id='m-a-p/SuperGPQA',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
@@ -139,13 +142,15 @@ class SuperGPQAAdapter(DataAdapter):
         return self.reformat_subset(data_dict, subset_key='field', format='{}')
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        choices = self._format_choices(input_d['options'])
         if not self.prompt_template:
             if few_shot_list:
-                prompt = self.few_shot_prompt.format(query=input_d['question'])
+                prompt = self.few_shot_prompt.format(query=question, choices=choices)
             else:
-                prompt = self.zero_shot_prompt.format(query=input_d['question'])
+                prompt = self.zero_shot_prompt.format(query=question, choices=choices)
         else:
-            prompt = self.prompt_template.format(query=input_d['question'])
+            prompt = self.prompt_template.format(query=question, choices=choices)
         return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
@@ -189,3 +194,16 @@ class SuperGPQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
+
+    def _format_choices(self, choices: list) -> str:
+        """
+        Format the choices into a string for display.
+
+        Args:
+            choices (list): List of choices.
+
+        Returns:
+            str: Formatted string of choices.
+        """
+        choice_list = [f'{option}) {content}' for option, content in zip(self.choices, choices)]
+        return '\n'.join(choice_list)
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt CHANGED
@@ -1,3 +1,4 @@
 Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
 
 {query}
+{choices}
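
Note (not part of the diff): with the adapter and prompt changes above, each SuperGPQA question is now followed by an explicit option list filled into the new {choices} placeholder. A small sketch of the rendering performed by the new _format_choices helper, with invented option texts:

    # Reproduces the option formatting applied by _format_choices.
    letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    options = ['first option text', 'second option text', 'third option text']  # illustrative
    print('\n'.join(f'{letter}) {content}' for letter, content in zip(letters, options)))
    # A) first option text
    # B) second option text
    # C) third option text
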
evalscope/benchmarks/tool_bench/tool_bench_adapter.py CHANGED
@@ -8,6 +8,11 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
+    tags=['Reasoning', 'Agent'],
+    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+    'It includes various subsets such as in-domain and out-of-domain, '
+    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',  # noqa: E501
     dataset_id='AI-ModelScope/ToolBench-Static',
     subset_list=['in_domain', 'out_of_domain'],
     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -16,6 +16,9 @@ logger = get_logger()
 @Benchmark.register(
     name='trivia_qa',
     pretty_name='TriviaQA',
+    tags=['QA', 'Reading Comprehension'],
+    description=
+    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.',  # noqa: E501
     dataset_id='modelscope/trivia_qa',
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -21,6 +21,9 @@ logger = get_logger()
 @Benchmark.register(
     name='truthful_qa',
     pretty_name='TruthfulQA',
+    tags=['Knowledge'],
+    description=
+    'TruthfulQA is a benchmark designed to evaluate the ability of AI models to answer questions truthfully and accurately. It includes multiple-choice and generation tasks, focusing on the model\'s understanding of factual information and its ability to generate coherent responses.',  # noqa: E501
     dataset_id='modelscope/truthful_qa',
     model_adapter=OutputType.CONTINUOUS,
     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
evalscope/benchmarks/winogrande/winogrande_adapter.py CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='winogrande',
     pretty_name='Winogrande',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'Winogrande is a benchmark for evaluating AI models on commonsense reasoning tasks, specifically designed to test the ability to resolve ambiguous pronouns in sentences.',  # noqa: E501
     dataset_id='AI-ModelScope/winogrande_val',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/collections/evaluator.py CHANGED
@@ -32,11 +32,22 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self, samples, infer_cfg) -> List[dict]:
+    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
         input_prompts = [sample.prompt for sample in samples]
         subset_name = samples[0].subset_name
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return [None] * len(samples), samples
+            else:
+                raise e
+        # process answers
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -66,7 +77,7 @@ class EvaluatorCollection:
         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
-    def load(self) -> tuple[list[DatasetEntry], str]:
+    def load(self) -> tuple[List[DatasetEntry], str]:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset
@@ -86,7 +97,7 @@ class EvaluatorCollection:
         return datasets, dataset_name
 
     @staticmethod
-    def _init_name_map(dataset):
+    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
         dataset_name_map = defaultdict(lambda: defaultdict(list))
         for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name
@@ -94,13 +105,13 @@ class EvaluatorCollection:
         return dataset_name_map
 
     @staticmethod
-    def _init_id_map(dataset):
+    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
         dataset_id_map = {}
         for sample in dataset:
             dataset_id_map[sample.index] = sample
         return dataset_id_map
 
-    def _initialize_evaluators(self):
+    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
         evaluators = {}
         # load dataset args
         dataset_args = deepcopy(self.task_cfg.dataset_args)
@@ -118,6 +129,8 @@ class EvaluatorCollection:
         return evaluators
 
     def get_report(self, scores):
+        if not scores:
+            return
 
         def get_dataframe(scores):
             data = []
@@ -241,9 +254,12 @@ class EvaluatorCollection:
                 # Process completed tasks
                 for future in as_completed(futures):
                     answer_list, samples = future.result()
-                    answers[samples[0].index] = answer_list[0]
-                    dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(1)
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(1)
         else:
             for dataset_name, data_map in dataset_name_map.items():
                 # get evaluator for the dataset
@@ -253,13 +269,14 @@ class EvaluatorCollection:
                     # get batch samples
                     batch_ids = ids[i:i + eval_batch_size]
                     batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                    answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                    answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                     # update answers
-                    for j, _id in enumerate(batch_ids):
-                        answers[_id] = answer_list[j]
-                    dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
-
-                    pbar.update(len(batch_ids))
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(1)
         return answers
 
     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
@@ -289,19 +306,22 @@ class EvaluatorCollection:
 
         reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
-
-            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
-                # Use cached review if available
-                review_d = review_history_map[file_name][sample.index]
-            else:
-                # Generate new review
-                evaluator = self.evaluators[sample.dataset_name]
-                review_d = evaluator.get_review(answers[sample.index])
-                # Only save the review if it's not in the cache
-                self._save_review(review_file_path, file_name, review_d)
-
-            reviews[sample.index] = review_d
+            try:
+                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                    # Use cached review if available
+                    review_d = review_history_map[file_name][sample.index]
+                else:
+                    # Generate new review
+                    evaluator = self.evaluators[sample.dataset_name]
+                    review_d = evaluator.get_review(answers[sample.index])
+                    # Only save the review if it's not in the cache
+                    self._save_review(review_file_path, file_name, review_d)
+
+                reviews[sample.index] = review_d
+            except Exception as e:
+                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')
 
         return reviews
 
@@ -339,6 +359,8 @@ class EvaluatorCollection:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
            evaluator = self.evaluators[sample.dataset_name]
+            if sample.index not in reviews:
+                continue
            review_d = reviews[sample.index]
            score = evaluator.get_score(review_d)
            scores[sample.index] = score
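
Note (not part of the diff): the new error handling in SimpleEvaluator.get_answer only drops a failing batch when the task is configured with ignore_errors; otherwise the exception is re-raised as before. A rough sketch of opting in; the run_task entry point and dict-style config are assumptions, only the ignore_errors flag comes from this diff:

    from evalscope import run_task  # assumed public API

    run_task(task_cfg={
        'model': 'qwen-plus',      # illustrative model name
        'datasets': ['gsm8k'],     # illustrative dataset choice
        'ignore_errors': True,     # log and skip failed predictions instead of aborting the run
    })
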
evalscope/constants.py CHANGED
@@ -146,7 +146,7 @@ class EvalType:
 
 
 class OutputType:
-    LOGITS = 'logits'  # for multiple choice tasks
+    LOGITS = 'logits'  # for logits output tasks
     GENERATION = 'generation'  # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
     CONTINUOUS = 'continuous_logits'  # for continuous tasks