evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
CHANGED
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa

@@ -15,7 +16,9 @@ logger = get_logger()

 @Benchmark.register(
     name='general_mcq',
-    pretty_name='General
+    pretty_name='General-MCQ',
+    description='A general multiple-choice question answering dataset.',
+    tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

@@ -24,7 +27,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
-    prompt_template='
+    prompt_template='请回答问题，并选出其中的正确答案。你的回答的最后一行应该是这样的格式：“答案是：LETTER”（不带引号），其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
     query_template='问题：{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):

@@ -34,28 +37,21 @@ class GeneralMCQAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
-                ...
-                if subset_name in data_dict:
-                    data_dict[subset_name].update({split_name: rows})
-                else:
-                    data_dict[subset_name] = {split_name: rows}
-
-        return data_dict
+                # Check for files with different extensions
+                for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+                    if os.path.exists(dataset_name_or_path):
+                        file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+                    else:
+                        file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+
+                    if os.path.exists(file_path):
+                        data_dict[subset_name][split_name] = loader(file_path)
+                        break  # Stop checking other extensions once a file is found
+
+        return dict(data_dict)

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
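
Note: the rewritten `load_from_disk` above replaces the old CSV-only reader with an extension fallback, trying `{subset}_{split}.jsonl` first and then `{subset}_{split}.csv`. A minimal standalone sketch of that lookup, using stdlib stand-ins for the `evalscope.utils.io_utils` helpers (all function names, paths, and defaults below are illustrative, not evalscope's API):

import csv
import json
import os
from collections import defaultdict


def jsonl_to_list(path):  # stand-in for evalscope.utils.io_utils.jsonl_to_list
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def csv_to_list(path):  # stand-in for evalscope.utils.io_utils.csv_to_list
    with open(path, encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def load_local_mcq(dataset_dir, subsets=('example',), splits=('dev', 'val')):
    """Try <subset>_<split>.jsonl first, then .csv, mirroring the new loader."""
    data = defaultdict(dict)
    for subset in subsets:
        for split in splits:
            for ext, loader in (('.jsonl', jsonl_to_list), ('.csv', csv_to_list)):
                path = os.path.join(dataset_dir, f'{subset}_{split}{ext}')
                if os.path.exists(path):
                    data[subset][split] = loader(path)
                    break  # prefer .jsonl; stop at the first format found
    return dict(data)
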
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()

 @Benchmark.register(
     name='general_qa',
+    pretty_name='General-QA',
+    description='General Question Answering dataset',
+    tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
     metric_list=['AverageBLEU', 'AverageRouge'],
evalscope/benchmarks/gpqa/gpqa_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import exact_match
 @Benchmark.register(
     name='gpqa',
     pretty_name='GPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
     dataset_id='modelscope/gpqa',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
CHANGED
@@ -15,6 +15,9 @@ logger = get_logger()
 @Benchmark.register(
     name='gsm8k',
     pretty_name='GSM8K',
+    tags=['Mathematics'],
+    description=
+    'GSM8K (Grade School Math 8K) is a dataset of grade school math problems, designed to evaluate the mathematical reasoning abilities of AI models.',
     dataset_id='modelscope/gsm8k',
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
CHANGED
@@ -18,6 +18,9 @@ logger = get_logger()
 @Benchmark.register(
     name='hellaswag',
     pretty_name='HellaSwag',
+    tags=['Commonsense', 'MCQ', 'Knowledge'],
+    description=
+    'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
     dataset_id='modelscope/hellaswag',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()
 @Benchmark.register(
     name='humaneval',
     pretty_name='HumanEval',
+    tags=['Coding'],
+    description=
+    'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
     dataset_id='modelscope/humaneval',
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
evalscope/benchmarks/ifeval/ifeval_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='ifeval',
     pretty_name='IFEval',
+    tags=['Instruction-Following'],
+    description=
+    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
     dataset_id='opencompass/ifeval',
     subset_list=['default'],
     metric_list=[
evalscope/benchmarks/iquiz/iquiz_adapter.py
CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='iquiz',
     pretty_name='IQuiz',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.', # noqa: E501
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
CHANGED
@@ -8,7 +8,10 @@ logger = get_logger()

 @Benchmark.register(
     name='live_code_bench',
-    pretty_name='Live
+    pretty_name='Live-Code-Bench',
+    tags=['Coding'],
+    description=
+    'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.', # noqa: E501
     dataset_id='AI-ModelScope/code_generation_lite',
     subset_list=['release_latest'],
     metric_list=['Pass@1'],
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
CHANGED
@@ -11,6 +11,9 @@ SUBSET_LIST = ['default']
 @Benchmark.register(
     name='maritime_bench',
     pretty_name='MaritimeBench',
+    tags=['Maritime', 'MCQ', 'Knowledge'],
+    description=
+    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='HiDolphin/MaritimeBench',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/math_500/math_500_adapter.py
CHANGED
@@ -10,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='math_500',
     pretty_name='MATH-500',
+    tags=['Mathematics'],
+    description=
+    "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.", # noqa: E501
     dataset_id='AI-ModelScope/MATH-500',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -136,6 +136,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu',
     pretty_name='MMLU',
+    tags=['Knowledge', 'MCQ'],
+    description=
+    "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.", # noqa: E501
     dataset_id='modelscope/mmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -15,6 +15,9 @@ SUBSET_LIST = [
 @Benchmark.register(
     name='mmlu_pro',
     pretty_name='MMLU-Pro',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='modelscope/MMLU-Pro',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
CHANGED
@@ -88,6 +88,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu_redux',
     pretty_name='MMLU-Redux',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='AI-ModelScope/mmlu-redux-2.0',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/musr/musr_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='musr',
     pretty_name='MuSR',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
     dataset_id='AI-ModelScope/MuSR',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
CHANGED
@@ -28,9 +28,11 @@ Don't give information outside the document or repeat your findings."""

 @Benchmark.register(
     name='needle_haystack',
-    pretty_name='Needle
-    ...
+    pretty_name='Needle-in-a-Haystack',
+    tags=['Retrieval', 'Long Context'],
+    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+    'It requires the model to find specific information within a large corpus of text. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
     dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
     metric_list=['AverageAccuracy'],
     subset_list=['english', 'chinese'],

@@ -50,6 +52,7 @@ Don't give information outside the document or repeat your findings."""
         'document_depth_percent_max': 100,
         'document_depth_percent_intervals': 10,
         'tokenizer_path': 'Qwen/Qwen3-0.6B',
+        'show_score': False,
     })
 class NeedleHaystackAdapter(DataAdapter):

@@ -71,11 +74,12 @@ class NeedleHaystackAdapter(DataAdapter):
         self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
         self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
         self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = extra_params.get('show_score', False)

-        self.
-        self.
+        self._init_tokenizer()
+        self._init_length()

-    def
+    def _init_length(self):
         """ Initialize context lengths and document depth percentages based on the provided parameters."""
         import numpy as np

@@ -93,7 +97,7 @@ class NeedleHaystackAdapter(DataAdapter):
             num=self.document_depth_percent_intervals,
             endpoint=True)).astype(int)

-    def
+    def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

@@ -335,7 +339,10 @@ class NeedleHaystackAdapter(DataAdapter):
                 pivot_table = sub_df.pivot_table(
                     values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
-                draw_score_chat(
+                draw_score_chat(
+                    pivot_table,
+                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                    show_score=self.show_score)

         except Exception as e:
             logger.error(f'Error generating charts: {e}')
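
The adapter now reads an extra `show_score` flag (default `False`) and forwards it to `draw_score_chat`, so heatmap cells can be annotated with their scores. A hedged configuration sketch follows: the `TaskConfig`/`dataset_args`/`extra_params` layout is assumed from published examples for earlier releases and should be checked against the docs for your version; the model name is a placeholder.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-plus',                      # placeholder model name
    datasets=['needle_haystack'],
    dataset_args={
        'needle_haystack': {
            'subset_list': ['english'],
            'extra_params': {
                'tokenizer_path': 'Qwen/Qwen3-0.6B',
                'show_score': True,         # new in 0.16.2: annotate heatmap cells
            },
        },
    },
)
run_task(task_cfg)
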
evalscope/benchmarks/needle_haystack/utils.py
CHANGED
@@ -37,13 +37,13 @@ def parse_score(score_str: str) -> int:
         return 0.0


-def draw_score_chat(pivot_table, outpath):
+def draw_score_chat(pivot_table, outpath, show_score=False):
     # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
     cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

     # Create the heatmap with better aesthetics
     plt.figure(figsize=(17.5, 8)) # Can adjust these dimensions as needed
-    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=
+    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})

     # More aesthetics
     plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")') # Adds a title
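
For reference, `annot=` in `seaborn.heatmap` controls whether each cell is labelled with its value, which is all the new `show_score` parameter toggles. A self-contained toy reproduction of the call (synthetic data and output file name, not evalscope code):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

depths = [0, 25, 50, 75, 100]                 # document depth (%)
contexts = [1000, 4000, 16000, 32000]         # context length (tokens)
rng = np.random.default_rng(0)
pivot_table = pd.DataFrame(
    rng.uniform(0.0, 1.0, size=(len(depths), len(contexts))),
    index=depths, columns=contexts)

cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
show_score = True                             # False reproduces the old, unannotated chart
plt.figure(figsize=(17.5, 8))
sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f',
            cmap=cmap, cbar_kws={'label': 'Score'})
plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')
plt.savefig('needle_haystack_heatmap_demo.png')
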
evalscope/benchmarks/process_bench/process_bench_adapter.py
CHANGED
@@ -12,6 +12,9 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
 @Benchmark.register(
     name='process_bench',
     pretty_name='ProcessBench',
+    tags=['Mathematical', 'Reasoning'],
+    description=
+    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
     dataset_id='Qwen/ProcessBench',
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='race',
     pretty_name='RACE',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.', # noqa: E501
     dataset_id='modelscope/race',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED
@@ -95,6 +95,9 @@ Just return the letters "A", "B", or "C", with no text around it.
 @Benchmark.register(
     name='simple_qa',
     pretty_name='SimpleQA',
+    tags=['Knowledge', 'QA'],
+    description=
+    'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.', # noqa: E501
     dataset_id='AI-ModelScope/SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
     few_shot_num=0,
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py
CHANGED
@@ -109,6 +109,9 @@ SUBSET_MAPPING = {
 @Benchmark.register(
     name='super_gpqa',
     pretty_name='SuperGPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.', # noqa: E501
     dataset_id='m-a-p/SuperGPQA',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

@@ -139,13 +142,15 @@ class SuperGPQAAdapter(DataAdapter):
         return self.reformat_subset(data_dict, subset_key='field', format='{}')

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        choices = self._format_choices(input_d['options'])
         if not self.prompt_template:
             if few_shot_list:
-                prompt = self.few_shot_prompt.format(query=
+                prompt = self.few_shot_prompt.format(query=question, choices=choices)
             else:
-                prompt = self.zero_shot_prompt.format(query=
+                prompt = self.zero_shot_prompt.format(query=question, choices=choices)
         else:
-            prompt = self.prompt_template.format(query=
+            prompt = self.prompt_template.format(query=question, choices=choices)
         return self.gen_prompt_data(prompt)

     def get_gold_answer(self, input_d: dict) -> str:

@@ -189,3 +194,16 @@ class SuperGPQAAdapter(DataAdapter):

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
+
+    def _format_choices(self, choices: list) -> str:
+        """
+        Format the choices into a string for display.
+
+        Args:
+            choices (list): List of choices.
+
+        Returns:
+            str: Formatted string of choices.
+        """
+        choice_list = [f'{option}) {content}' for option, content in zip(self.choices, choices)]
+        return '\n'.join(choice_list)
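
The new `_format_choices` helper simply pairs option letters with option texts and joins them line by line; `gen_prompt` then fills both `{query}` and `{choices}` in the prompt templates. A toy standalone equivalent (sample option values invented):

letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
options = ['photosynthesis', 'fermentation', 'respiration', 'transpiration']

formatted = '\n'.join(f'{letter}) {content}' for letter, content in zip(letters, options))
print(formatted)
# A) photosynthesis
# B) fermentation
# C) respiration
# D) transpiration
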
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
CHANGED
@@ -8,6 +8,11 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
+    tags=['Reasoning', 'Agent'],
+    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+    'It includes various subsets such as in-domain and out-of-domain, '
+    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)', # noqa: E501
     dataset_id='AI-ModelScope/ToolBench-Static',
     subset_list=['in_domain', 'out_of_domain'],
     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -16,6 +16,9 @@ logger = get_logger()
 @Benchmark.register(
     name='trivia_qa',
     pretty_name='TriviaQA',
+    tags=['QA', 'Reading Comprehension'],
+    description=
+    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.', # noqa: E501
     dataset_id='modelscope/trivia_qa',
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -21,6 +21,9 @@ logger = get_logger()
 @Benchmark.register(
     name='truthful_qa',
     pretty_name='TruthfulQA',
+    tags=['Knowledge'],
+    description=
+    'TruthfulQA is a benchmark designed to evaluate the ability of AI models to answer questions truthfully and accurately. It includes multiple-choice and generation tasks, focusing on the model\'s understanding of factual information and its ability to generate coherent responses.', # noqa: E501
     dataset_id='modelscope/truthful_qa',
     model_adapter=OutputType.CONTINUOUS,
     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
evalscope/benchmarks/winogrande/winogrande_adapter.py
CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='winogrande',
     pretty_name='Winogrande',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'Winogrande is a benchmark for evaluating AI models on commonsense reasoning tasks, specifically designed to test the ability to resolve ambiguous pronouns in sentences.', # noqa: E501
     dataset_id='AI-ModelScope/winogrande_val',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/collections/evaluator.py
CHANGED
@@ -32,11 +32,22 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)

-    def get_answer(self, samples, infer_cfg) -> List[dict]:
+    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
         input_prompts = [sample.prompt for sample in samples]
         subset_name = samples[0].subset_name
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return [None] * len(samples), samples
+            else:
+                raise e
+        # process answers
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

@@ -66,7 +77,7 @@ class EvaluatorCollection:
         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()

-    def load(self) -> tuple[
+    def load(self) -> tuple[List[DatasetEntry], str]:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset

@@ -86,7 +97,7 @@ class EvaluatorCollection:
         return datasets, dataset_name

     @staticmethod
-    def _init_name_map(dataset):
+    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
         dataset_name_map = defaultdict(lambda: defaultdict(list))
         for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name

@@ -94,13 +105,13 @@ class EvaluatorCollection:
         return dataset_name_map

     @staticmethod
-    def _init_id_map(dataset):
+    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
         dataset_id_map = {}
         for sample in dataset:
             dataset_id_map[sample.index] = sample
         return dataset_id_map

-    def _initialize_evaluators(self):
+    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
         evaluators = {}
         # load dataset args
         dataset_args = deepcopy(self.task_cfg.dataset_args)

@@ -118,6 +129,8 @@ class EvaluatorCollection:
         return evaluators

     def get_report(self, scores):
+        if not scores:
+            return

         def get_dataframe(scores):
             data = []

@@ -241,9 +254,12 @@ class EvaluatorCollection:
                 # Process completed tasks
                 for future in as_completed(futures):
                     answer_list, samples = future.result()
-                    ...
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         else:
             for dataset_name, data_map in dataset_name_map.items():
                 # get evaluator for the dataset

@@ -253,13 +269,14 @@ class EvaluatorCollection:
                     # get batch samples
                     batch_ids = ids[i:i + eval_batch_size]
                     batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                    answer_list,
+                    answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                     # update answers
-                    for
-                        ...
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         return answers

     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:

@@ -289,19 +306,22 @@ class EvaluatorCollection:

         reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-            ...
+            try:
+                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                    # Use cached review if available
+                    review_d = review_history_map[file_name][sample.index]
+                else:
+                    # Generate new review
+                    evaluator = self.evaluators[sample.dataset_name]
+                    review_d = evaluator.get_review(answers[sample.index])
+                    # Only save the review if it's not in the cache
+                    self._save_review(review_file_path, file_name, review_d)
+
+                reviews[sample.index] = review_d
+            except Exception as e:
+                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')

         return reviews

@@ -339,6 +359,8 @@ class EvaluatorCollection:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
             evaluator = self.evaluators[sample.dataset_name]
+            if sample.index not in reviews:
+                continue
             review_d = reviews[sample.index]
             score = evaluator.get_score(review_d)
             scores[sample.index] = score
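
These changes thread `ignore_errors` through batched inference: when a batch fails, `get_answer` returns `None` placeholders instead of raising, and the answer, review, and score loops skip the missing entries. A condensed sketch of that control flow (simplified free functions, not the actual evalscope classes):

from typing import Callable, Dict, List, Optional


def get_answer(prompts: List[str], predict: Callable, ignore_errors: bool) -> List[Optional[dict]]:
    try:
        return predict(prompts)
    except Exception as e:
        if ignore_errors:
            print(f'Dropping failed batch and continuing: {e}')
            return [None] * len(prompts)
        raise


def collect_answers(batches: List[List[str]], predict: Callable, ignore_errors: bool = True) -> Dict[int, dict]:
    answers: Dict[int, dict] = {}
    index = 0
    for prompts in batches:
        for answer in get_answer(prompts, predict, ignore_errors):
            if answer is not None:  # mirrors the `if answer_d is None: continue` guard above
                answers[index] = answer
            index += 1
    return answers


# Example: the second batch fails; its prompts are skipped rather than aborting the run.
def flaky_predict(prompts):
    if 'boom' in prompts:
        raise RuntimeError('backend error')
    return [{'prompt': p, 'answer': 'ok'} for p in prompts]


print(collect_answers([['a', 'b'], ['boom', 'c']], flaky_predict))
# {0: {'prompt': 'a', 'answer': 'ok'}, 1: {'prompt': 'b', 'answer': 'ok'}}
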
evalscope/constants.py
CHANGED
@@ -146,7 +146,7 @@ class EvalType:


 class OutputType:
-    LOGITS = 'logits' # for
+    LOGITS = 'logits' # for logits output tasks
     GENERATION = 'generation' # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits' # for multiple choice tasks
     CONTINUOUS = 'continuous_logits' # for continuous tasks