evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -0,0 +1,159 @@
+import ast
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+OPEN_PROMPT = """
+Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+"""
+
+MULTI_CHOICE_TYPE = 'multiple-choice'
+OPEN_TYPE = 'open'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu',
+        pretty_name='MMMU',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI) benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering. These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.', # noqa: E501
+        dataset_id='AI-ModelScope/MMMU',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='validation',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MMMUAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question_type = record['question_type']
+        content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)
+
+        metadata = {
+            'id': record['id'],
+            'question_type': record['question_type'],
+            'subfield': record['subfield'],
+            'explanation': record['explanation'],
+            'img_type': record['img_type'],
+            'topic_difficulty': record['topic_difficulty'],
+        }
+
+        if question_type == MULTI_CHOICE_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=record['answer'],
+                metadata=metadata,
+            )
+        elif question_type == OPEN_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                metadata=metadata,
+            )
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        question_type = task_state.metadata['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers = parse_answers(task_state)
+            return ''.join(sorted(list(answers)))
+        elif question_type == OPEN_TYPE:
+            pattern = r'ANSWER:\s*(.*)'
+            match = re.search(pattern, prediction)
+            if match:
+                return match.group(1).strip()
+            return ''
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: Dict[str, Any]) -> tuple[List[Content], List[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+        question_type = record['question_type']
+
+        if question_type == MULTI_CHOICE_TYPE:
+            answers_list: List[str] = ast.literal_eval(record['options'])
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: List[str] = []
+            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+
+        for i in range(MMMUAdapter.MAX_IMAGES):
+            image = record[f'image_{i+1}']
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        return content_list, answers_list
File without changes
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py
@@ -0,0 +1,129 @@
+import ast
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+VISION_PROMPT = r"""
+Answer the following multiple choice question in image. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+""".strip() # noqa: E501
+
+DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu_pro',
+        pretty_name='MMMU-PRO',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.', # noqa: E501
+        dataset_id='AI-ModelScope/MMMU_Pro',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+        extra_params={
+            'dataset_format': f"# choose from {DATASET_FORMATS}, default 'standard (4 options)'",
+        }
+    )
+)
+class MMMUPROAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.reformat_subset = True
+        self.dataset_format = self.extra_params.get('dataset_format', 'standard (4 options)')
+        if self.dataset_format not in DATASET_FORMATS:
+            logger.warning(f"Invalid dataset_format '{self.dataset_format}', fallback to 'standard (4 options)'")
+            self.dataset_format = 'standard (4 options)'
+        self.default_subset = self.dataset_format
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        metadata = {
+            'id': record['id'],
+            'explanation': record.get('explanation'),
+            'img_type': record.get('img_type'),
+            'topic_difficulty': record.get('topic_difficulty'),
+            'subject': record.get('subject')
+        }
+
+        answers_list: List[str] = ast.literal_eval(record['options'])
+
+        if self.dataset_format == 'vision':
+            letters = ','.join(answer_character(i) for i in range(len(answers_list)))
+            input_text = VISION_PROMPT.format(letters=letters)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+            image = record.get('image')
+            if image:
+                content_list.append(ContentImage(image=bytes_to_base64(image['bytes'], format='png', add_header=True)))
+        else:
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+            for i in range(MMMUPROAdapter.MAX_IMAGES):
+                image = record.get(f'image_{i+1}')
+                if image:
+                    image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                    content_list.append(ContentImage(image=image_base64))
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=answers_list,
+            target=record['answer'],
+            subset_key=record['subject'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
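For context, here is a minimal usage sketch (not part of the diff) of how the new MMMU and MMMU-Pro adapters might be driven from Python. The model name, endpoint, API key, and the TaskConfig/run_task/dataset_args plumbing are assumptions carried over from earlier evalscope releases, not values taken from this package.

# Hedged sketch: evaluate the new 'mmmu' and 'mmmu_pro' benchmarks against an
# OpenAI-compatible vision-language endpoint. All names below are placeholders,
# and the config fields are assumed to match earlier evalscope releases.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-vl-plus',                                 # placeholder VLM served behind an API
    api_url='http://localhost:8801/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',
    eval_type='service',                                  # assumption: API-based evaluation mode
    datasets=['mmmu', 'mmmu_pro'],
    dataset_args={
        'mmmu_pro': {
            # mirrors the adapter's dataset_format extra_params option shown above
            'extra_params': {'dataset_format': 'standard (4 options)'},
        },
    },
    limit=5,  # small smoke-test run
)

run_task(task_cfg=task_cfg)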
@@ -164,7 +164,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
                 records.append(record)
 
             dataset = DictDataLoader(
-                dict_list=records,
+                dict_list=records,
+                limit=self.limit,
+                repeats=self.repeats,
+                sample_fields=self.record_to_sample,
+                shuffle=self.shuffle,
             ).load()
 
             datasets[subset_name] = dataset
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)
 
         next_message = oai_res[0].message.model_dump(exclude_none=True)
 
@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
@@ -35,8 +36,8 @@ logger = get_logger()
         'api_key': 'EMPTY',
         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
         'generation_config': {
-            'temperature': 0.
-            '
+            'temperature': 0.0,
+            'max_tokens': 4096,
         }
     }
 )
@@ -46,22 +47,13 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
-            )
+        check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
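For reference, a hedged sketch (not from the diff) of how the user-simulator settings that this __init__ reads from extra_params could be supplied from a task config. The dataset_args passthrough, model names, and endpoint are assumptions based on earlier evalscope usage rather than anything defined in this hunk.

# Hedged sketch: overriding tau_bench user-simulator settings via extra_params.
# 'my-agent-model' and the endpoint are placeholders; the dataset_args mechanism
# is assumed to behave as in earlier evalscope releases.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-agent-model',  # placeholder: the model under evaluation
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'extra_params': {
                'user_model': 'qwen-plus',  # user simulator, matching the adapter defaults
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        },
    },
)

run_task(task_cfg=task_cfg)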
@@ -84,10 +76,10 @@ class TauBenchAdapter(DefaultDataAdapter):
 
             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message = res.
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +106,11 @@ class TauBenchAdapter(DefaultDataAdapter):
                 })
             # load dataset
             dataset = DictDataLoader(
-                dict_list=tasks,
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
             ).load()
 
             data_dict[env_name] = dataset
@@ -145,15 +141,15 @@ class TauBenchAdapter(DefaultDataAdapter):
 
         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
 
             score.value = {
                 'Pass^1': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
File without changes
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
             }
         )
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):
 
         super().__init__(**kwargs)
 
-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
evalscope/cli/start_app.py
CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py
CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)