evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +9 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/utils.py +1 -0
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +5 -3
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/completion_parsers.py +7 -0
- evalscope/metrics/llm_judge.py +6 -5
- evalscope/metrics/metrics.py +19 -7
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +2 -0
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +13 -0
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +2 -2
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/utils/io_utils.py +10 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
- tests/cli/test_all.py +18 -2
- tests/cli/test_run.py +25 -37
- tests/perf/test_perf.py +29 -2
- evalscope/models/model.py +0 -189
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED
@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
evalscope/benchmarks/data_adapter.py CHANGED
@@ -168,6 +168,11 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
         return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
         pass

     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -477,7 +483,6 @@ class DataAdapter(ABC):
         """
         return result

-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -491,7 +496,7 @@ class DataAdapter(ABC):
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-
+        return 1.0 if gold == pred else 0.0

     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -17,7 +17,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_mcq',
     pretty_name='General-MCQ',
-    description='A general multiple-choice question answering dataset.'
+    description='A general multiple-choice question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
     tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -14,7 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     pretty_name='General-QA',
-    description='
+    description='A general question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
     tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
evalscope/benchmarks/hle/__init__.py ADDED
File without changes
evalscope/benchmarks/hle/hle_adapter.py ADDED
@@ -0,0 +1,118 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Biology/Medicine',
+    'Chemistry',
+    'Computer Science/AI',
+    'Engineering',
+    'Humanities/Social Science',
+    'Math',
+    'Physics',
+    'Other',
+]
+
+
+@Benchmark.register(
+    name='hle',
+    pretty_name="Humanity's-Last-Exam",
+    tags=['Knowledge', 'QA'],
+    description=
+    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.',  # noqa: E501
+    dataset_id='cais/hle',
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class HLEAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.llm_as_a_judge = True
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # remove image preview
+        input_d.pop('image_preview', None)
+        input_d.pop('rationale_image', None)
+        # generate prompt
+        question = input_d['question']
+        prompt = self.prompt_template.format(query=question)
+        image = input_d.get('image', None)
+        # build messages for multi-modal input
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        if image:
+            messages.append({
+                'role':
+                'user',
+                'content': [{
+                    'type': 'text',
+                    'text': prompt
+                }, {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': image
+                    }
+                }]
+            })
+        else:
+            messages.append({'role': 'user', 'content': prompt})
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        # Extract the answer from the model output \boxed{answer}
+        match = re.search(r'\\boxed{([^}]*)}', result)
+        if match:
+            return match.group(1).strip()
+        else:
+            logger.warning(f'No answer found in the model output: {result}')
+            return ''
+
+    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str) -> dict:
+        # simple match
+        return {
+            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+        }
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = judge.build_prompt(pred, gold, question)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+        return {
+            'AverageAccuracy': score,
+            'response': judge_response,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        # zip dict answers
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
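Note: the new `hle` adapter sets `llm_as_a_judge = True`, so grading can use an LLM judge rather than only the `\boxed{}` exact match. A minimal usage sketch, assuming evalscope's usual `TaskConfig`/`run_task` entry points and a placeholder model id (not part of this diff):

    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='qwen-plus',   # placeholder model id
        datasets=['hle'],    # benchmark adapter added in 0.17.1
        limit=5,             # small smoke-test run
    )
    run_task(task_cfg)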
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED
@@ -22,7 +22,8 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template=
+    prompt_template=
+    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}',  # noqa: E501
     extra_params={
         'num_workers': 4,
         'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):

     @classmethod
     def _postprocess(cls, text: str) -> str:
-
-
-
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-        if not text.startswith('\n'):  # in case starting with ```python
-            text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+        if len(blocks) >= 1:
+            text = blocks[0]
         return text

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
-    few_shot_num=
+    few_shot_num=0,
     train_split='train',
     eval_split='test',
     prompt_template=
evalscope/benchmarks/tau_bench/__init__.py ADDED
File without changes
evalscope/benchmarks/tau_bench/tau_bench_adapter.py ADDED
@@ -0,0 +1,110 @@
+import importlib
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tau_bench',
+    pretty_name='τ-bench',
+    tags=['Reasoning', 'Agent', 'Function Calling'],
+    description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+    'and a language agent provided with domain-specific API tools and policy guidelines. '
+    'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ',  # noqa: E501
+    dataset_id='https://github.com/sierra-research/tau-bench',
+    model_adapter='tau_bench_server',
+    subset_list=['airline', 'retail'],
+    metric_list=['Pass^1'],
+    eval_split='test',
+    extra_params={
+        'user_model': 'qwen-plus',
+        'api_key': 'EMPTY',
+        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+        'generation_config': {
+            'temperature': 0.7,
+            'max_new_tokens': 1024
+        }
+    })
+class TauBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('tau_bench')
+        if spec is None:
+            raise ImportError(
+                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
+            )
+
+        metric_registry.register(Metric(name='Pass^1', object=mean))
+
+        # setup user model args
+        extra_params = kwargs.get('extra_params', {})
+        self.user_model = extra_params.get('user_model', 'qwen-plus')
+        self.api_key = extra_params.get('api_key', 'EMPTY')
+        self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+
+        self._patch_env_completion()
+
+    def _patch_env_completion(self) -> str:
+        from tau_bench.envs.user import LLMUserSimulationEnv
+
+        def new_generate_next_message(self, messages):
+            from evalscope.models import ServerModelAdapter
+
+            user_server = ServerModelAdapter(
+                api_url=adapter_instance.api_base,
+                model_id=adapter_instance.user_model,
+                api_key=adapter_instance.api_key)
+            request_json = user_server.make_request(
+                input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
+            res = user_server.send_request(request_json)
+
+            message = res['choices'][0]['message']
+            self.messages.append(message)
+            self.total_cost = 0
+            return message['content']
+
+        # get the current instance of TauBenchAdapter
+        adapter_instance = self
+        LLMUserSimulationEnv.generate_next_message = new_generate_next_message
+
+    def load(self, **kwargs):
+        from tau_bench.envs import get_env
+
+        data_dict = defaultdict(dict)
+        for env_name in self.subset_list:
+            logger.info(f'Loading TauBench environment: {env_name}')
+            env = get_env(
+                env_name=env_name,
+                user_strategy='llm',
+                user_model='dummy',  # Use dummy model to prevent errors
+                user_provider='openai',  # Use dummy provider to prevent errors
+                task_split=self.eval_split,
+            )
+            tasks = []
+            for i in range(len(env.tasks)):
+                tasks.append({
+                    'task_index': i,
+                    'env_name': env_name,
+                })
+            data_dict[env_name][self.eval_split] = tasks
+
+        return data_dict
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        return self.gen_prompt_data(extra_data=input_d)
+
+    def get_gold_answer(self, input_d):
+        return ''
+
+    def match(self, gold, pred):
+        import json
+        res = json.loads(pred)
+        return res.get('reward', 0.0)
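As the benchmark description says, τ-bench needs the external `tau_bench` package plus a user-simulator model supplied via `extra_params`. A hedged sketch of a possible run configuration, assuming evalscope's standard `dataset_args` mechanism; the model ids and credentials below are placeholders, not values from this diff:

    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='qwen-plus',            # agent model under test (placeholder)
        datasets=['tau_bench'],
        dataset_args={
            'tau_bench': {
                'subset_list': ['airline'],
                'extra_params': {
                    'user_model': 'qwen-plus',   # user simulator (placeholder)
                    'api_key': 'EMPTY',
                    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                },
            }
        },
    )
    run_task(task_cfg)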
evalscope/benchmarks/tool_bench/tool_bench_adapter.py CHANGED
@@ -1,3 +1,4 @@
+import json
 from typing import Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -8,7 +9,7 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
-    tags=['Reasoning', 'Agent'],
+    tags=['Reasoning', 'Agent', 'Function Calling'],
     description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
     'It includes various subsets such as in-domain and out-of-domain, '
    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
@@ -40,6 +41,11 @@ class ToolBenchAdapter(DataAdapter):
         for message in messages:
             if 'name' in message:
                 del message['name']
+            if 'role' in message:
+                if message['role'] == 'function':
+                    content = json.dumps(message, ensure_ascii=False)
+                    message['role'] = 'user'
+                    message['content'] = content
         return self.gen_prompt_data(prompt='', messages=messages)

     def get_gold_answer(self, input_d: dict) -> str:
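The added block folds tool-result messages into plain user turns for API servers that reject the `function` role. Roughly, with an illustrative payload (not taken from the dataset), a message like

    {'role': 'function', 'content': '{"temperature": 21}'}

would be re-sent as

    {'role': 'user', 'content': '{"role": "function", "content": "{\"temperature\": 21}"}'}

since the whole original message (after the `name` key is dropped) is JSON-dumped into the new `content`.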
evalscope/benchmarks/utils.py CHANGED
@@ -13,6 +13,7 @@ class PromptData:
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
     messages: Optional[List[dict]] = None
+    extra_data: Optional[Dict] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
evalscope/constants.py CHANGED
@@ -41,27 +41,6 @@ class MetricsConstant:
     ]


-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:

     MODEL_A = 'model_a'
@@ -172,6 +151,11 @@ class JudgeStrategy:
     LLM_RECALL = 'llm_recall'


+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
evalscope/evaluator/__init__.py CHANGED
evalscope/evaluator/evaluator.py CHANGED
@@ -7,17 +7,19 @@ from collections import OrderedDict, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
-from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import dict_torch_dtype_to_str

+if TYPE_CHECKING:
+    from evalscope.models import BaseModelAdapter
+
 logger = get_logger()


@@ -38,7 +40,7 @@ class Evaluator(object):

     def __init__(self,
                  data_adapter: DataAdapter,
-                 model_adapter: BaseModelAdapter,
+                 model_adapter: 'BaseModelAdapter',
                  outputs: OutputsStructure = None,
                  task_cfg: TaskConfig = None,
                  **kwargs):
evalscope/metrics/__init__.py CHANGED
@@ -5,7 +5,7 @@ from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
     from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
-    from .llm_judge import LLMJudge
+    from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
@@ -34,6 +34,8 @@ else:
         ],
         'llm_judge': [
             'LLMJudge',
+            'DEFAULT_PROMPT_TEMPLATE',
+            'DEFAULT_NUMERIC_SCORE_TEMPLATE',
         ],
         'math_parser': [
             'extract_answer',
evalscope/metrics/completion_parsers.py CHANGED
@@ -218,3 +218,10 @@ class ResponseParser:
         # Join options into a regex pattern separated by '|', to match any of the options
         options_pattern = '|'.join(escaped_options)
         return options_pattern
+
+
+if __name__ == '__main__':
+    result = '**Answer: A **Answer: C**'
+    options = ['A', 'B', 'C', 'D']
+    parsed_result = ResponseParser.parse_first_option(result, options)
+    print(f'Parsed result: {parsed_result}')  # Should print 'C'
evalscope/metrics/llm_judge.py CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional

+from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -56,7 +57,7 @@ class LLMJudge:
                  generation_config: Optional[Dict[str, Any]] = None,
                  score_pattern: Optional[str] = None,
                  score_mapping: Optional[Dict[str, float]] = None,
-                 score_type: str =
+                 score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
                  **kwargs):
         """
         Initialize LLMJudge metric.
@@ -82,11 +83,11 @@ class LLMJudge:

         # Default score mapping for A/B pattern
         self.score_type = score_type
-        if self.score_type ==
+        if self.score_type == JudgeScoreType.NUMERIC:
             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
                                                                      DEFAULT_NUMERIC_SCORE_TEMPLATE)
-        elif self.score_type ==
+        elif self.score_type == JudgeScoreType.PATTERN:
             self.score_pattern = score_pattern or r'(A|B)'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         else:
@@ -159,9 +160,9 @@ class LLMJudge:
             return 0.0

         # choose extraction method based on score_type
-        if self.score_type ==
+        if self.score_type == JudgeScoreType.NUMERIC:
             return self._extract_numeric_score(response)
-        elif self.score_type ==
+        elif self.score_type == JudgeScoreType.PATTERN:
             return self._extract_pattern_score(response)

     def _extract_numeric_score(self, response: str) -> Optional[float]:
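Taken together with the new `JudgeScoreType` constants in `constants.py`, the judge now selects its score-extraction mode explicitly. A minimal sketch of the two modes; constructing `LLMJudge` without arguments assumes judge credentials come from the environment, which is an assumption, not something shown in this diff:

    from evalscope.constants import JudgeScoreType
    from evalscope.metrics import LLMJudge

    numeric_judge = LLMJudge(score_type=JudgeScoreType.NUMERIC)  # parses scores like [[0.8]]
    pattern_judge = LLMJudge(score_type=JudgeScoreType.PATTERN)  # parses A/B style verdicts

    print(numeric_judge.get_score('Reasoning ... final verdict: [[0.8]]'))  # -> 0.8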
evalscope/metrics/metrics.py CHANGED
@@ -9,7 +9,7 @@ import random
 import sacrebleu
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import
+from typing import Dict, List, Union


 def mean(arr: list):
@@ -22,16 +22,28 @@ def mean(arr: list):


 def pass_at_k(arr: Union[List[int], List[List[int]]], k: int = 1) -> float:
+    """
+    Calculates the pass@k metric using the calculate_pass_at_k function.
+
+    Args:
+        arr: List of binary values (1 for correct, 0 for incorrect) or list of such lists
+        k: Number of attempts allowed
+
+    Returns:
+        The average pass@k score across all problems
+    """
     if not arr:
         return 0.0
+    if not isinstance(arr[0], list):
+        # If arr is a simple list of binary results, convert it to a list of lists
+        arr = [arr]

-
-
+    # For list of lists case, each inner list represents attempts for one problem
+    num_samples = [len(sub_arr) for sub_arr in arr]
+    num_correct = [sum(sub_arr) for sub_arr in arr]
+    pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, k)

-
-        return sum(sub_pass_at_k(sub_arr) for sub_arr in arr) / len(arr)
-    else:
-        return sum(arr) / len(arr)
+    return float(np.mean(pass_at_k_values))


 def pop_stddev(arr):
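The rewritten `pass_at_k` now routes both flat and nested inputs through `calculate_pass_at_k` and averages the per-problem estimates. Illustrative calls based on the code above (values chosen for the example):

    from evalscope.metrics.metrics import pass_at_k

    # one problem, three attempts, one correct -> pass@1 = 1 - C(2,1)/C(3,1) = 1/3
    print(pass_at_k([1, 0, 0], k=1))

    # two problems with two attempts each -> (0.5 + 0.0) / 2 = 0.25
    print(pass_at_k([[1, 0], [0, 0]], k=1))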
evalscope/models/__init__.py CHANGED
@@ -4,12 +4,11 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
-    from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+    from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
                            CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-                           initialize_model_adapter)
+                           TauBenchAdapter, initialize_model_adapter)
     from .custom import CustomModel, DummyCustomModel
     from .local_model import LocalModel, get_local_model
-    from .model import BaseModel, ChatBaseModel, OpenAIModel
     from .register import get_model_adapter

 else:
@@ -23,6 +22,8 @@ else:
             'CustomModelAdapter',
             'ServerModelAdapter',
             'T2IModelAdapter',
+            'TauBenchAdapter',
+            'BFCLAdapter',
         ],
         'custom': [
             'CustomModel',
@@ -32,11 +33,6 @@ else:
             'LocalModel',
             'get_local_model',
         ],
-        'model': [
-            'BaseModel',
-            'ChatBaseModel',
-            'OpenAIModel',
-        ],
         'register': [
             'get_model_adapter',
         ],
evalscope/models/adapters/__init__.py CHANGED
@@ -5,15 +5,10 @@ from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdap
 from .custom_adapter import CustomModelAdapter
 from .server_adapter import ServerModelAdapter
 from .t2i_adapter import T2IModelAdapter
+from .tau_bench_adapter import TauBenchAdapter

 __all__ = [
-    'initialize_model_adapter',
-    'BaseModelAdapter',
-    'ChatGenerationModelAdapter',
-    'ContinuationLogitsModelAdapter',
-    'MultiChoiceModelAdapter',
-    'CustomModelAdapter',
-    'ServerModelAdapter',
-    'BFCLAdapter',
-    'T2IModelAdapter',
+    'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
+    'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
+    'TauBenchAdapter'
 ]