evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED

@@ -14,7 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     pretty_name='General-QA',
-    description='
+    description='A general question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
     tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
@@ -25,13 +26,21 @@ logger = get_logger()
     prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
-    # TODO: set few_shot_num

     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)

     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        """
+        Load dataset from the given path or dataset name.
+
+        Args:
+            dataset_name_or_path (str): Path to dataset directory or file.
+            subset_list (list): List of subset names to load.
+
+        Returns:
+            dict: Loaded dataset organized by subset.
+        """
         dataset_name_or_path = dataset_name_or_path or self.dataset_id
         subset_list = subset_list or self.subset_list

@@ -61,58 +70,64 @@ class GeneralQAAdapter(DataAdapter):

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
+        Generate prompt for the model based on input data.
+
         Args:
-            input_d:
-
-
+            input_d (dict): Input data dictionary.
+            subset_name (str): Name of the subset.
+            few_shot_list (list): List of few-shot examples.

         Returns:
-
-
+            dict: Dictionary containing the generated prompt.
         """
-
-        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-        if len(history) > 0:
-            logger.warning('The history is not included in the prompt for GeneralQA. \
-                To be supported in the future.')
-
+        messages = input_d.get('messages')
         query = input_d.get('question', '') or input_d.get('query', '')
         system_prompt = input_d.get('system')
         prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)

     def get_gold_answer(self, input_d: dict) -> str:
         """
+        Extract the gold (reference) answer from the input data.
+
         Args:
-            input_d
+            input_d (dict): Input data dictionary.

         Returns:
-
-
+            str: Gold answer string.
         """
-        return input_d.get('answer'
+        return input_d.get('answer') or input_d.get('response')

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
+        Parse the prediction result.
+
         Args:
-            result:
+            result (str): Model prediction result.
+            raw_input_d (dict, optional): Original input data.
+            eval_type (str): Evaluation type.

         Returns:
-
-
+            str: Parsed prediction result.
         """
         return result

     def match(self, gold: str, pred: str) -> dict:
         """
+        Compute metric scores between gold and predicted answers.
+
         Args:
-            gold:
-            pred:
+            gold (str): Gold answer.
+            pred (str): Predicted answer.

         Returns:
-
-
+            dict: Dictionary of computed metric scores.
         """
+        # reference free metrics
+        if gold is None:
+            return {'AverageAccuracy': -1}
+
+        # calculate rouge and bleu scores
         res = dict()
         if 'AverageRouge' in self.metric_list:
             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +143,13 @@ class GeneralQAAdapter(DataAdapter):

     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
         """
-
+        Compute weighted mean of the metric scores for all samples.

         Args:
-            review_res_list:
+            review_res_list (list): List of metric score dictionaries.

         Returns:
-
-
+            list: List of dictionaries with averaged metric results.
         """
         items = super().compute_dict_metric(review_res_list, **kwargs)
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
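Note: the reworked GeneralQAAdapter reads an optional `messages` list, a `query` (or `question`) field, an optional `system` prompt, and an `answer` (or `response`) reference, and returns `{'AverageAccuracy': -1}` when no reference is available. A minimal sketch of a custom JSONL record built from those field names follows; the file name and on-disk layout are assumptions, not taken from this diff.

    import json

    # Hypothetical records using the fields read by GeneralQAAdapter:
    # 'query' (or 'question'), optional 'system', and 'response' (or 'answer').
    records = [
        {'system': 'You are a concise assistant.',
         'query': 'What is the capital of France?',
         'response': 'Paris'},
        # No reference answer: match() takes the reference-free branch
        # and reports {'AverageAccuracy': -1} for this sample.
        {'query': 'Summarize the plot of Hamlet in one sentence.'},
    ]

    with open('general_qa_default.jsonl', 'w', encoding='utf-8') as f:  # assumed file name
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')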
evalscope/benchmarks/hellaswag/hellaswag_adapter.py CHANGED

@@ -6,9 +6,9 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser

 # flake8: noqa

evalscope/benchmarks/hle/__init__.py ADDED (file without changes)

evalscope/benchmarks/hle/hle_adapter.py ADDED

@@ -0,0 +1,118 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Biology/Medicine',
+    'Chemistry',
+    'Computer Science/AI',
+    'Engineering',
+    'Humanities/Social Science',
+    'Math',
+    'Physics',
+    'Other',
+]
+
+
+@Benchmark.register(
+    name='hle',
+    pretty_name="Humanity's-Last-Exam",
+    tags=['Knowledge', 'QA'],
+    description=
+    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.',  # noqa: E501
+    dataset_id='cais/hle',
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class HLEAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.llm_as_a_judge = True
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # remove image preview
+        input_d.pop('image_preview', None)
+        input_d.pop('rationale_image', None)
+        # generate prompt
+        question = input_d['question']
+        prompt = self.prompt_template.format(query=question)
+        image = input_d.get('image', None)
+        # build messages for multi-modal input
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        if image:
+            messages.append({
+                'role':
+                'user',
+                'content': [{
+                    'type': 'text',
+                    'text': prompt
+                }, {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': image
+                    }
+                }]
+            })
+        else:
+            messages.append({'role': 'user', 'content': prompt})
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        # Extract the answer from the model output \boxed{answer}
+        match = re.search(r'\\boxed{([^}]*)}', result)
+        if match:
+            return match.group(1).strip()
+        else:
+            logger.warning(f'No answer found in the model output: {result}')
+            return ''
+
+    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str) -> dict:
+        # simple match
+        return {
+            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+        }
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = judge.build_prompt(pred, gold, question)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+        return {
+            'AverageAccuracy': score,
+            'response': judge_response,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        # zip dict answers
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
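Note: the new `hle` benchmark sets `llm_as_a_judge = True`, so accuracy is graded by an LLM judge rather than plain string matching. A minimal sketch of invoking it, assuming evalscope's usual `TaskConfig`/`run_task` entry points; the model name and the small `limit` are illustrative, and any judge endpoint configuration required by `LLMJudge` is omitted here.

    from evalscope.run import run_task        # assumed entry point
    from evalscope.config import TaskConfig   # assumed entry point

    task_cfg = TaskConfig(
        model='qwen-plus',    # hypothetical model identifier
        datasets=['hle'],     # benchmark name registered by hle_adapter.py
        limit=10,             # small smoke-test run
    )
    run_task(task_cfg=task_cfg)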
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED

@@ -22,7 +22,8 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template=
+    prompt_template=
+    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}',  # noqa: E501
     extra_params={
         'num_workers': 4,
         'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):

     @classmethod
     def _postprocess(cls, text: str) -> str:
-
-
-
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith(' '):
-            if text.startswith(' '):
-                text = ' ' + text.lstrip()
-            else:
-                text = '\n'.join([' ' + line for line in text.split('\n')])
+        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+        if len(blocks) >= 1:
+            text = blocks[0]
         return text

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
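Note: `_postprocess` now simply keeps the first fenced code block, if any, instead of the old line-trimming heuristics. A standalone check of that regex on a typical completion:

    import re

    completion = (
        'Here is the implementation:\n'
        '```python\n'
        'def add(a, b):\n'
        '    return a + b\n'
        '```\n'
        'Hope this helps.'
    )

    # Same pattern as the new _postprocess: capture the body of the first ``` block.
    blocks = re.findall(r'```\w*\n(.*?)```', completion, re.DOTALL)
    code = blocks[0] if blocks else completion  # same fallback: keep the raw text when no block is found
    print(code)
    # def add(a, b):
    #     return a + b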
evalscope/benchmarks/ifeval/ifeval_adapter.py CHANGED

@@ -2,7 +2,6 @@ from collections import defaultdict
 from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry

@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        return result
-
     def match(self, gold: Any, pred: Any) -> Dict:
+        from evalscope.benchmarks.ifeval.utils import process_results
+
         return process_results(gold, [pred])

     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
evalscope/benchmarks/iquiz/iquiz_adapter.py CHANGED

@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py CHANGED

@@ -69,12 +69,6 @@ class LiveCodeBenchAdapter(DataAdapter):
         # Extract the gold answer from the input dict.
         return input_d

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        return result
-
     def match(self, gold: dict, pred: str) -> float:
         from .evaluate_utils import codegen_metrics
         from .extract_utils import extract_code_generation
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py CHANGED

@@ -3,7 +3,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser

 SUBSET_LIST = ['default']

evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED

@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
-    few_shot_num=
+    few_shot_num=0,
     train_split='train',
     eval_split='test',
     prompt_template=
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED

@@ -4,7 +4,7 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser

 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py CHANGED

@@ -4,8 +4,8 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser

 logger = get_logger()

evalscope/benchmarks/musr/musr_adapter.py CHANGED

@@ -4,7 +4,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(
evalscope/benchmarks/race/race_adapter.py CHANGED

@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

evalscope/benchmarks/tau_bench/__init__.py ADDED (file without changes)

evalscope/benchmarks/tau_bench/tau_bench_adapter.py ADDED

@@ -0,0 +1,110 @@
+import importlib
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tau_bench',
+    pretty_name='τ-bench',
+    tags=['Reasoning', 'Agent', 'Function Calling'],
+    description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+    'and a language agent provided with domain-specific API tools and policy guidelines. '
+    'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ',  # noqa: E501
+    dataset_id='https://github.com/sierra-research/tau-bench',
+    model_adapter='tau_bench_server',
+    subset_list=['airline', 'retail'],
+    metric_list=['Pass^1'],
+    eval_split='test',
+    extra_params={
+        'user_model': 'qwen-plus',
+        'api_key': 'EMPTY',
+        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+        'generation_config': {
+            'temperature': 0.7,
+            'max_new_tokens': 1024
+        }
+    })
+class TauBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('tau_bench')
+        if spec is None:
+            raise ImportError(
+                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
+            )
+
+        metric_registry.register(Metric(name='Pass^1', object=mean))
+
+        # setup user model args
+        extra_params = kwargs.get('extra_params', {})
+        self.user_model = extra_params.get('user_model', 'qwen-plus')
+        self.api_key = extra_params.get('api_key', 'EMPTY')
+        self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+
+        self._patch_env_completion()
+
+    def _patch_env_completion(self) -> str:
+        from tau_bench.envs.user import LLMUserSimulationEnv
+
+        def new_generate_next_message(self, messages):
+            from evalscope.models import ServerModelAdapter
+
+            user_server = ServerModelAdapter(
+                api_url=adapter_instance.api_base,
+                model_id=adapter_instance.user_model,
+                api_key=adapter_instance.api_key)
+            request_json = user_server.make_request(
+                input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
+            res = user_server.send_request(request_json)
+
+            message = res['choices'][0]['message']
+            self.messages.append(message)
+            self.total_cost = 0
+            return message['content']
+
+        # get the current instance of TauBenchAdapter
+        adapter_instance = self
+        LLMUserSimulationEnv.generate_next_message = new_generate_next_message
+
+    def load(self, **kwargs):
+        from tau_bench.envs import get_env
+
+        data_dict = defaultdict(dict)
+        for env_name in self.subset_list:
+            logger.info(f'Loading TauBench environment: {env_name}')
+            env = get_env(
+                env_name=env_name,
+                user_strategy='llm',
+                user_model='dummy',  # Use dummy model to prevent errors
+                user_provider='openai',  # Use dummy provider to prevent errors
+                task_split=self.eval_split,
+            )
+            tasks = []
+            for i in range(len(env.tasks)):
+                tasks.append({
+                    'task_index': i,
+                    'env_name': env_name,
+                })
+            data_dict[env_name][self.eval_split] = tasks
+
+        return data_dict
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        return self.gen_prompt_data(extra_data=input_d)
+
+    def get_gold_answer(self, input_d):
+        return ''
+
+    def match(self, gold, pred):
+        import json
+        res = json.loads(pred)
+        return res.get('reward', 0.0)
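Note: TauBenchAdapter reads its simulated-user settings from `extra_params` (the register defaults above). A sketch of an override using only the keys shown in this diff; passing it through evalscope's per-dataset arguments (for example `dataset_args={'tau_bench': {'extra_params': ...}}`) is assumed plumbing, not something shown here.

    # Override the user-simulator settings consumed by TauBenchAdapter.__init__.
    tau_bench_extra_params = {
        'user_model': 'qwen-plus',
        'api_key': 'sk-xxx',  # placeholder credential
        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'generation_config': {
            'temperature': 0.7,
            'max_new_tokens': 1024,
        },
    }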
evalscope/benchmarks/tool_bench/tool_bench_adapter.py CHANGED

@@ -1,3 +1,4 @@
+import json
 from typing import Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -8,7 +9,7 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
-    tags=['Reasoning', 'Agent'],
+    tags=['Reasoning', 'Agent', 'Function Calling'],
     description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
     'It includes various subsets such as in-domain and out-of-domain, '
     'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
@@ -40,6 +41,11 @@ class ToolBenchAdapter(DataAdapter):
         for message in messages:
             if 'name' in message:
                 del message['name']
+            if 'role' in message:
+                if message['role'] == 'function':
+                    content = json.dumps(message, ensure_ascii=False)
+                    message['role'] = 'user'
+                    message['content'] = content
         return self.gen_prompt_data(prompt='', messages=messages)

     def get_gold_answer(self, input_d: dict) -> str:
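Note: ToolBenchAdapter now folds `role: 'function'` messages into user turns, serializing the whole original message as JSON content, presumably so OpenAI-compatible endpoints that do not accept the function role still receive the tool output. A standalone illustration of that rewrite:

    import json

    message = {'role': 'function', 'content': '{"temperature": 23, "unit": "C"}'}

    # Same rewrite as in gen_prompt: serialize the message and resend it as a user turn.
    if message.get('role') == 'function':
        message['content'] = json.dumps(message, ensure_ascii=False)
        message['role'] = 'user'

    # message is now a user turn whose content is the JSON-serialized original message.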
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED

@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']

-
+        if self.few_shot_num > 0:
+            sys_prompt = get_sys_prompt(input_d)
+        else:
+            sys_prompt = None
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context

-        return self.gen_prompt_data(full_prompt)
+        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)

     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
@@ -124,7 +127,9 @@ class TriviaQaAdapter(DataAdapter):
         return result

     def match(self, gold: list, pred: str) -> float:
-
+        lower_pred = pred.lower()
+        gold = [g.lower() for g in gold]
+        is_correct = any([cand in lower_pred for cand in gold])
         return 1 if is_correct else 0

     @classmethod
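Note: TriviaQA matching is now a case-insensitive containment check against any of the gold aliases. A tiny standalone check of the same logic:

    def trivia_match(gold, pred):
        # Mirrors TriviaQaAdapter.match: case-insensitive substring check against any alias.
        lower_pred = pred.lower()
        gold = [g.lower() for g in gold]
        return 1 if any(cand in lower_pred for cand in gold) else 0

    print(trivia_match(['Paris', 'City of Light'], 'The answer is paris.'))  # 1
    print(trivia_match(['Paris'], 'I think it is Lyon.'))                    # 0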
evalscope/benchmarks/utils.py CHANGED

@@ -2,8 +2,7 @@ from dataclasses import asdict, dataclass
 from functools import wraps
 from typing import Dict, List, Optional, Union

-from
-from evalscope.utils.filters import Filter
+from .filters import Filter


 @dataclass
@@ -14,6 +13,7 @@ class PromptData:
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
     messages: Optional[List[dict]] = None
+    extra_data: Optional[Dict] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
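Note: PromptData gains an `extra_data` slot (used by the new τ-bench adapter via `gen_prompt_data(extra_data=...)`), and `to_dict()` keeps dropping unset fields. A reduced stand-in showing the same dataclass pattern; only the fields visible in the hunk are included, so this is a sketch rather than the full class.

    from dataclasses import asdict, dataclass
    from typing import Dict, List, Optional

    @dataclass
    class PromptDataSketch:  # hypothetical stand-in for evalscope's PromptData
        multi_choices: Optional[List[str]] = None
        id: Optional[str] = None
        messages: Optional[List[dict]] = None
        extra_data: Optional[Dict] = None

        def to_dict(self) -> Dict:
            # Same trick as the real to_dict: drop fields that were never set.
            return {k: v for k, v in asdict(self).items() if v is not None}

    print(PromptDataSketch(extra_data={'task_index': 0, 'env_name': 'airline'}).to_dict())
    # {'extra_data': {'task_index': 0, 'env_name': 'airline'}}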
evalscope/benchmarks/winogrande/winogrande_adapter.py CHANGED

@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(