evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,5 +1,4 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import OutputType
 from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
@@ -11,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
+    tags=['Mathematics'],
+    description=
+    'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
     dataset_id='opencompass/AIME2025',
     subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],

evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -47,6 +47,11 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
+    tags=['Instruction-Following', 'Reasoning'],
+    description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+    'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+    'provide more accurate and cost-effective model assessments. '
+    'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.',  # noqa: E501
     dataset_id='AI-ModelScope/alpaca_eval',
     subset_list=['alpaca_eval_gpt4_baseline'],
     metric_list=['winrate'],

evalscope/benchmarks/arc/arc_adapter.py
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='arc',
     pretty_name='ARC',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.',  # noqa: E501
     dataset_id='modelscope/ai2_arc',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -1,5 +1,3 @@
-import re
-from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -19,12 +17,18 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
+    tags=['Instruction-Following', 'Reasoning'],
+    description=
+    'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+    'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+    'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+    'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.',  # noqa: E501
     dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
     metric_list=['winrate'],
     few_shot_num=0,
     train_split=None,
     eval_split='test')
-class
+class ArenaHardAdapter(DataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

evalscope/benchmarks/bbh/bbh_adapter.py
@@ -59,6 +59,9 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 @Benchmark.register(
     name='bbh',
     pretty_name='BBH',
+    tags=['Reasoning'],
+    description=
+    'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.',  # noqa: E501
     dataset_id='modelscope/bbh',
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/benchmark.py
@@ -28,6 +28,8 @@ class BenchmarkMeta:
     system_prompt: Optional[str] = None
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
+    description: Optional[str] = None
+    tags: Optional[List[str]] = field(default_factory=list)
     filters: Optional[OrderedDict] = None
     extra_params: Optional[Dict] = field(default_factory=dict)
 
evalscope/benchmarks/bfcl/__init__.py
File without changes
evalscope/benchmarks/bfcl/bfcl_adapter.py (new file)
@@ -0,0 +1,237 @@
+import copy
+import importlib
+import json
+import re
+import traceback
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBJECT_MAPPING = {
+    'simple': 'AST_NON_LIVE',
+    'multiple': 'AST_NON_LIVE',
+    'parallel': 'AST_NON_LIVE',
+    'parallel_multiple': 'AST_NON_LIVE',
+    'java': 'AST_NON_LIVE',
+    'javascript': 'AST_NON_LIVE',
+    'live_simple': 'AST_LIVE',
+    'live_multiple': 'AST_LIVE',
+    'live_parallel': 'AST_LIVE',
+    'live_parallel_multiple': 'AST_LIVE',
+    'irrelevance': 'RELEVANCE',
+    'live_relevance': 'RELEVANCE',
+    'live_irrelevance': 'RELEVANCE',
+    'multi_turn_base': 'MULTI_TURN',
+    'multi_turn_miss_func': 'MULTI_TURN',
+    'multi_turn_miss_param': 'MULTI_TURN',
+    'multi_turn_long_context': 'MULTI_TURN'
+}
+
+
+@Benchmark.register(
+    name='bfcl_v3',
+    pretty_name='BFCL-v3',
+    tags=['Agent'],
+    description=
+    'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
+    'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
+    'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+    'Need to run `pip install bfcl-eval` before evaluating. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',  # noqa: E501
+    dataset_id='AI-ModelScope/bfcl_v3',
+    subset_list=list(SUBJECT_MAPPING.keys()),
+    model_adapter='bfcl_server',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    extra_params={
+        'underscore_to_dot': True,
+        'is_fc_model': True,
+    })
+class BFCLAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('bfcl_eval')
+        if spec is None:
+            raise ImportError(
+                '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+
+        self.category_map = SUBJECT_MAPPING
+
+        extra_params = kwargs.get('extra_params', {})
+        self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
+        self.is_fc_model = extra_params.get('is_fc_model', True)
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+
+    def preprocess_row(self, row: dict):
+        """
+        Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
+        """
+        row['should_execute_tool_calls'] = True if row['multi_turn'] else False
+        row['functions'] = json.loads(row['functions'])
+        row['tools'] = json.loads(row['tools'])
+        row['turns'] = json.loads(row['turns'])
+        row['missing_functions'] = json.loads(row['missed_functions'])
+        row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
+        row['initial_config'] = json.loads(row['initial_config'])
+        row['is_fc_model'] = self.is_fc_model
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        self.preprocess_row(input_d)
+
+        # If the model is a function calling model, we need to remove the system prompt
+        if self.is_fc_model:
+            turns = input_d['turns']
+            new_turns = []
+            for turn_idx, messages in enumerate(turns):
+                current_messages = messages.copy()
+                if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
+                    current_messages = current_messages[1:]
+                new_turns.append(current_messages)
+            input_d['turns'] = new_turns
+
+        return self.gen_prompt_data(prompt='', messages=input_d)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('ground_truth', )
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
+        row = copy.deepcopy(raw_input_d)
+        del row['turns']  # Remove turns as they are not needed for the match function
+
+        row['generation'] = result
+        return row
+
+    def match(self, gold: dict, pred: dict) -> dict:
+        from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
+        from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
+                                                   default_decode_execute_prompting)
+        from bfcl_eval.utils import is_empty_output
+
+        # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+        # which decides if model was provided with functions of the type
+        # spotify.list_songs or spotify_list_songs
+        # It is False for all llama models (when using via prompting)
+        # and True for API calls
+        if self.underscore_to_dot:
+            dummy_model = 'gpt-4o-2024-11-20-FC'
+        else:
+            dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+        row = pred
+        test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+        if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+            error = None
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                # successful decode means valid function call was present
+                contains_func_call = True
+                if is_empty_output(decoded_tool_calls):
+                    # Empty output is not considered as a valid function call
+                    contains_func_call = False
+                    error = 'Empty decoded output.'
+            except Exception:
+                contains_func_call = False
+                error = f'Failed to decode with traceback: {traceback.format_exc()}'
+            finally:
+                valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                score_result = {'valid': valid, 'error_message': error}
+
+        elif row['multi_turn']:
+            # each step might give a list of tool calls and each turn is multi-step
+            # and multi-turn has generations of all the turns
+            # hence in a multi-turn setting,
+            # multi_turn_decoded_generations is a list of list of list of strings
+            multi_turn_decoded_generations: list[list[list[str]]] = []
+            for single_turn_generations in row['generation']:
+                single_turn_decoded_generations: list[list[str]] = []
+                for generation in single_turn_generations:
+                    try:
+                        if self.is_fc_model:
+                            tool_calls = convert_to_function_call(generation)
+                        else:
+                            tool_calls = default_decode_execute_prompting(generation)
+
+                        single_turn_decoded_generations.append(tool_calls)
+                    except Exception:
+                        single_turn_decoded_generations.append([generation])
+
+                multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+            try:
+                raw_score_result = multi_turn_checker(
+                    multi_turn_decoded_generations,
+                    row['ground_truth'],
+                    row,
+                    test_category,
+                    dummy_model,
+                )
+            except Exception:
+                raw_score_result = {
+                    'valid': False,
+                    'error_type': 'multi_turn:checker_failed',
+                    'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                }
+
+            score_result = {
+                'valid': float(raw_score_result['valid']),
+                'error_message': raw_score_result.get('error_message', ''),
+                'error_type': raw_score_result.get('error_type', ''),
+            }
+        else:
+            try:
+                if self.is_fc_model:
+                    decoded_tool_calls = []
+                    for tool_call in row['generation'][0]:
+                        name = list(tool_call.keys())[0]
+                        params = json.loads(tool_call[name])
+                        decoded_tool_calls.append({name: params})
+                else:
+                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                score_result = ast_checker(
+                    row['functions'],
+                    decoded_tool_calls,
+                    row['ground_truth'],
+                    row['language'],
+                    row['test_category'],
+                    dummy_model,
+                )
+            except Exception:
+                score_result = {
+                    'valid': False,
+                    'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                    'error_type': 'ast_decoder:decoder_failed',
+                }
+
+        return {
+            'AverageAccuracy': float(score_result['valid']),
+            'raw_score_result': score_result,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
+        # aggregate review results
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
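
For orientation, the new `bfcl_v3` entry can be exercised like any other registered benchmark. The sketch below is a hedged, assumed invocation: the `TaskConfig`/`run_task` entry points, the `api_url`/`eval_type` fields, and the served model name follow evalscope's general usage pattern rather than this diff, so treat them as illustrative; only the dataset name and the `extra_params` keys mirror the registration above.

# Hedged usage sketch (not taken from this diff): everything except `datasets`
# and the extra_params keys is an assumption or a placeholder.
from evalscope import TaskConfig, run_task  # assumed public entry points

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',            # placeholder served model name
    api_url='http://127.0.0.1:8801/v1',     # placeholder OpenAI-compatible endpoint
    eval_type='service',                    # assumed value for the server/bfcl_server path
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            'extra_params': {
                'underscore_to_dot': True,  # mirrors the registered defaults above
                'is_fc_model': True,
            },
        },
    },
)
run_task(task_cfg)

Note that the adapter raises an ImportError unless `bfcl-eval` is installed first (`pip install bfcl-eval`).
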
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -126,6 +126,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='ceval',
     pretty_name='C-Eval',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
     dataset_id='modelscope/ceval-exam',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
@@ -87,7 +87,10 @@ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应
 
 @Benchmark.register(
     name='chinese_simpleqa',
-    pretty_name='Chinese
+    pretty_name='Chinese-SimpleQA',
+    tags=['Knowledge', 'QA', 'Chinese'],
+    description=
+    "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.",  # noqa: E501
     subset_list=SUBSET_LIST,
     dataset_id='AI-ModelScope/Chinese-SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],

evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -103,6 +103,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='cmmlu',
     pretty_name='C-MMLU',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
     dataset_id='modelscope/cmmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='competition_math',
     pretty_name='MATH',
+    tags=['Mathematics'],
+    description=
+    'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
     dataset_id='modelscope/competition_math',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/data_adapter.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Union
 
-from evalscope.benchmarks.utils import PromptData, preprocess_decorator
+from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -15,6 +15,13 @@ logger = get_logger()
 
 
 class DataAdapter(ABC):
+    """
+    Data Adapter for the benchmark. You need to implement the following methods:
+        - gen_prompt
+        - get_gold_answer
+        - parse_pred_result
+        - match
+    """
 
     def __init__(self,
                  name: str,
@@ -31,30 +38,37 @@ class DataAdapter(ABC):
                  system_prompt: Optional[str] = None,
                  query_template: Optional[str] = None,
                  pretty_name: Optional[str] = None,
+                 description: Optional[str] = None,
+                 tags: Optional[List[str]] = None,
                  **kwargs):
         """
-        Data Adapter for the benchmark. You need to implement the following methods:
-            - gen_prompt
-            - get_gold_answer
-            - parse_pred_result
-            - match
         Args:
             name: str, the name of the benchmark.
             dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
+            model_adapter: str, the model adapter to use for the benchmark.
            subset_list: list of subset names for the dataset.
            metric_list: list, the metric list to evaluate the model on specific benchmark.
+            llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
+            output_types: list, the output types of the model adapter. Default: [model_adapter]
            few_shot_num: int, number of few-shot examples. Default: 0
            train_split: str, usually for few-shot examples. e.g. 'train'
            eval_split: str, the target eval split name. e.g. 'test'
            prompt_template: str, the prompt template for the benchmark,
                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                    the form of A or B or C or D, do not output explanation:`
-        """
+            system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
+            query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
+            pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
+            description: str, the description of the benchmark,
+                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
+        """  # noqa: E501
         self.name = name
         self.dataset_id = dataset_id
         self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
+        self.llm_as_a_judge = llm_as_a_judge
+        self.output_types = output_types or [model_adapter]
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
@@ -62,9 +76,9 @@ class DataAdapter(ABC):
         self.system_prompt = system_prompt
         self.query_template = query_template
         self.pretty_name = pretty_name
+        self.description = description
+        self.tags = tags or []
         self.config_kwargs = kwargs
-        self.output_types = output_types or [model_adapter]
-        self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
 
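
Since this release moves the adapter contract into the class-level docstring, here is a minimal sketch of what a custom adapter implementing the four required methods could look like. The benchmark name, dataset id, and exact-match scoring are hypothetical; only the decorator parameters, the method names, and the signatures follow patterns visible in this diff.

# Hypothetical adapter sketch -- illustrative names, not shipped code.
from evalscope.benchmarks import Benchmark, DataAdapter


@Benchmark.register(
    name='my_custom_qa',                      # hypothetical benchmark name
    pretty_name='MyCustomQA',
    tags=['QA'],                              # new metadata field in this release
    description='Toy QA benchmark used to illustrate the DataAdapter contract.',
    dataset_id='my-org/my-custom-qa',         # hypothetical dataset id
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test')
class MyCustomQAAdapter(DataAdapter):

    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
        # Wrap the raw question into the standard prompt payload.
        return self.gen_prompt_data(prompt=input_d['question'])

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = None) -> str:
        # Keep the raw generation; real adapters usually extract/strip the answer here.
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        # Exact-match scoring feeding the AverageAccuracy metric.
        return float(gold.strip() == pred.strip())
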
@@ -156,6 +170,49 @@
         """
         return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
+    def load_with_snapshot(self,
+                           file_structure: Dict[str, List[str]],
+                           dataset_name_or_path: str = None,
+                           subset_list: list = None,
+                           work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                           **kwargs) -> dict:
+        """
+        For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
+        This feature supports both remote and local datasets.
+
+        Args:
+            file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
+            dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
+            subset_list: list of subset names for the dataset.
+            work_dir: str, the working directory to store the dataset.
+        Returns: {'subset_name': {'eval': eval_dataset}}
+        """  # noqa: E501
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
+        subset_list = subset_list or self.subset_list
+
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # flatten file structure
+            file_names = [file for sub_files in file_structure.values() for file in sub_files]
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
+        # read and process files
+        data_dict = defaultdict(dict)
+        for sub_name in subset_list:
+            file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
+            # not train split, only eval split
+            data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)
+
+        return data_dict
+
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
         Reformat the dataset subset with subset_key and format.
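
The new helper targets adapters whose datasets ship as plain files rather than MsDataset-loadable repos (the new docmath/frames adapters in this release are the likely consumers). A hedged sketch of how an adapter's load() override might drive it; the subset name and file name are placeholders, and only the load_with_snapshot signature and return shape come from the hunk above.

# Hypothetical load() override inside a DataAdapter subclass.
def load(self, **kwargs):
    # {'subset_name': ['file1.jsonl', ...]} per the documented file_structure shape
    file_structure = {'default': ['test.jsonl']}   # placeholder names
    return self.load_with_snapshot(file_structure, **kwargs)
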
@@ -249,7 +306,7 @@
     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
                             **kwargs) -> Dict[str, List[float]]:
         """
-        compute weighted mean of
+        compute weighted mean of score of all samples
 
         Args:
             review_res_list: [score1, score2, ...]
@@ -270,7 +327,7 @@
                 items['AverageAccuracy'].append(scores)
         return items
 
-    def gen_report(self, subset_score_map: dict,
+    def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
 
@@ -278,7 +335,7 @@
             subset_score_map: The subset-score map.
                 e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
 
-
+            model_name: The evaluation model name.
 
         Returns: The evaluation report.
 
@@ -312,9 +369,17 @@
                 "model_name": "qwen2.5"
             }
         """  # noqa: E501
-
-
-
+        return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
+
+    def post_process_report(self, report: Report, **kwargs):
+        """
+        Post-process the report after generation. Draw a chart, save to file, etc.
+        This method can be overridden to customize the report format or content.
+
+        Args:
+            report (Report): The generated report.
+        """
+        pass
 
     def gen_prompt_data(self,
                         prompt: str,
@@ -324,6 +389,23 @@
                         id: Optional[Union[int, str]] = None,
                         messages: Optional[List[dict]] = None,
                         **kwargs) -> dict:
+        """
+        Generates a dictionary representation of prompt data for evaluation or inference.
+
+        Args:
+            prompt (str): The main prompt or input text. Can also be a list of prompts.
+            system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
+            choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
+                If not provided, uses self.choices. Defaults to None.
+            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
+                Defaults to 0 if not provided. Defaults to None.
+            id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
+            messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
+                If messages is provided, it will be used as the prompt data instead of the prompt string.
+
+        Returns:
+            dict: A dictionary representation of the prompt data, suitable for further processing or model input.
+        """  # noqa: E501
         data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
             data=data,
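
The expanded docstring corresponds to the two call patterns an adapter's gen_prompt typically uses: a plain prompt (optionally with choices and a system prompt), or pre-built chat messages, which take precedence over `prompt`. A small illustration with made-up values:

# Inside a hypothetical adapter's gen_prompt(); all field values are illustrative.
prompt_data = self.gen_prompt_data(
    prompt='Which planet is known as the Red Planet?',
    system_prompt='You are a helpful assistant.',
    choices=['A', 'B', 'C', 'D'],
)

# Or hand over pre-built chat messages instead of a prompt string:
prompt_data = self.gen_prompt_data(
    prompt='',
    messages=[{'role': 'user', 'content': 'Which planet is known as the Red Planet?'}],
)
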
@@ -416,7 +498,8 @@
 
         # Extract question from raw_input if available
         raw_input = kwargs.get('raw_input', {})
-        question_keys = ['question', 'prompt', 'query', 'problem']
+        question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
+        # Find the first non-empty question key in raw_input
         question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
 
         # Request judge and obtain score

File without changes