evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (97)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
  6. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  7. evalscope/api/benchmark/benchmark.py +35 -0
  8. evalscope/api/benchmark/meta.py +6 -0
  9. evalscope/api/dataset/dataset.py +6 -6
  10. evalscope/api/dataset/loader.py +2 -1
  11. evalscope/api/evaluator/cache.py +24 -1
  12. evalscope/api/evaluator/state.py +12 -1
  13. evalscope/api/messages/__init__.py +1 -0
  14. evalscope/api/messages/chat_message.py +47 -2
  15. evalscope/api/metric/scorer.py +15 -7
  16. evalscope/api/mixin/__init__.py +0 -1
  17. evalscope/api/model/generate_config.py +1 -3
  18. evalscope/api/model/model.py +4 -1
  19. evalscope/app/app.py +3 -0
  20. evalscope/app/ui/single_model.py +3 -3
  21. evalscope/app/utils/data_utils.py +7 -7
  22. evalscope/app/utils/env_utils.py +12 -0
  23. evalscope/app/utils/text_utils.py +14 -12
  24. evalscope/arguments.py +2 -4
  25. evalscope/backend/opencompass/backend_manager.py +0 -2
  26. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  27. evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
  28. evalscope/benchmarks/bfcl/generation.py +2 -2
  29. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  31. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  32. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  33. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  34. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  35. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  36. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  37. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  38. evalscope/benchmarks/mmmu/__init__.py +0 -0
  39. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  40. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  41. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  42. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
  43. evalscope/benchmarks/tau_bench/generation.py +1 -1
  44. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
  45. evalscope/benchmarks/text2image/__init__.py +0 -0
  46. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  47. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  48. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  49. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  50. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  51. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  52. evalscope/cli/start_app.py +7 -1
  53. evalscope/cli/start_perf.py +7 -1
  54. evalscope/config.py +72 -13
  55. evalscope/constants.py +8 -0
  56. evalscope/evaluator/evaluator.py +6 -4
  57. evalscope/metrics/llm_judge.py +19 -7
  58. evalscope/models/image_edit_model.py +125 -0
  59. evalscope/models/model_apis.py +20 -0
  60. evalscope/models/openai_compatible.py +3 -0
  61. evalscope/models/text2image_model.py +2 -2
  62. evalscope/models/utils/openai.py +7 -4
  63. evalscope/perf/benchmark.py +2 -0
  64. evalscope/perf/utils/benchmark_util.py +8 -5
  65. evalscope/perf/utils/local_server.py +3 -0
  66. evalscope/report/__init__.py +0 -1
  67. evalscope/report/generator.py +8 -87
  68. evalscope/run.py +9 -5
  69. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  70. evalscope/utils/chat_service.py +1 -1
  71. evalscope/utils/import_utils.py +23 -1
  72. evalscope/utils/io_utils.py +42 -1
  73. evalscope/utils/model_utils.py +4 -3
  74. evalscope/utils/multi_choices.py +23 -6
  75. evalscope/version.py +2 -2
  76. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
  77. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
  78. tests/benchmark/test_eval.py +30 -31
  79. tests/benchmark/test_image_edit.py +65 -0
  80. tests/benchmark/test_vlm.py +80 -0
  81. tests/cli/test_all.py +83 -43
  82. tests/cli/test_collection.py +8 -5
  83. tests/cli/test_reasoning.py +81 -0
  84. tests/common.py +73 -0
  85. tests/perf/test_perf.py +4 -2
  86. tests/rag/test_clip_benchmark.py +0 -3
  87. evalscope/api/mixin/dataset_mixin.py +0 -105
  88. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  89. tests/aigc/__init__.py +0 -1
  90. /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
  91. /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
  92. /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
  93. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  94. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  95. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  96. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  97. /tests/{aigc → benchmark}/test_t2i.py +0 -0

evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -0,0 +1,159 @@
+import ast
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+OPEN_PROMPT = """
+Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+"""
+
+MULTI_CHOICE_TYPE = 'multiple-choice'
+OPEN_TYPE = 'open'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu',
+        pretty_name='MMMU',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI) benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering. These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.',  # noqa: E501
+        dataset_id='AI-ModelScope/MMMU',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='validation',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MMMUAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question_type = record['question_type']
+        content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)
+
+        metadata = {
+            'id': record['id'],
+            'question_type': record['question_type'],
+            'subfield': record['subfield'],
+            'explanation': record['explanation'],
+            'img_type': record['img_type'],
+            'topic_difficulty': record['topic_difficulty'],
+        }
+
+        if question_type == MULTI_CHOICE_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=record['answer'],
+                metadata=metadata,
+            )
+        elif question_type == OPEN_TYPE:
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                metadata=metadata,
+            )
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        question_type = task_state.metadata['question_type']
+        if question_type == MULTI_CHOICE_TYPE:
+            answers = parse_answers(task_state)
+            return ''.join(sorted(list(answers)))
+        elif question_type == OPEN_TYPE:
+            pattern = r'ANSWER:\s*(.*)'
+            match = re.search(pattern, prediction)
+            if match:
+                return match.group(1).strip()
+            return ''
+        else:
+            raise ValueError(f'Unsupported question type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: Dict[str, Any]) -> tuple[List[Content], List[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+        question_type = record['question_type']
+
+        if question_type == MULTI_CHOICE_TYPE:
+            answers_list: List[str] = ast.literal_eval(record['options'])
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: List[str] = []
+            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+
+        for i in range(MMMUAdapter.MAX_IMAGES):
+            image = record[f'image_{i+1}']
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        return content_list, answers_list
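
The adapter above registers MMMU under the benchmark name 'mmmu', builds multimodal samples (base64-encoded images plus a chain-of-thought multiple-choice or open-ended prompt), and extracts answers either via parse_answers or the "ANSWER: $ANSWER" pattern. A minimal usage sketch, assuming the documented TaskConfig/run_task entry points and leaving model and endpoint settings to your own setup (the model name below is a placeholder):

# Hedged sketch: smoke-testing the new 'mmmu' benchmark in evalscope 1.0.1.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='your-vision-language-model',  # placeholder; point this at a VLM you can serve
    datasets=['mmmu'],                   # benchmark name registered by MMMUAdapter above
    limit=5,                             # keep the first run small
)
run_task(task_cfg=task_cfg)
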
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py
@@ -0,0 +1,129 @@
+import ast
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Accounting',
+    'Agriculture',
+    'Architecture_and_Engineering',
+    'Art',
+    'Art_Theory',
+    'Basic_Medical_Science',
+    'Biology',
+    'Chemistry',
+    'Clinical_Medicine',
+    'Computer_Science',
+    'Design',
+    'Diagnostics_and_Laboratory_Medicine',
+    'Economics',
+    'Electronics',
+    'Energy_and_Power',
+    'Finance',
+    'Geography',
+    'History',
+    'Literature',
+    'Manage',
+    'Marketing',
+    'Materials',
+    'Math',
+    'Mechanical_Engineering',
+    'Music',
+    'Pharmacy',
+    'Physics',
+    'Psychology',
+    'Public_Health',
+    'Sociology',
+]
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+VISION_PROMPT = r"""
+Answer the following multiple choice question in image. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+""".strip()  # noqa: E501
+
+DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmmu_pro',
+        pretty_name='MMMU-PRO',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.',  # noqa: E501
+        dataset_id='AI-ModelScope/MMMU_Pro',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+        extra_params={
+            'dataset_format': f"# choose from {DATASET_FORMATS}, default 'standard (4 options)'",
+        }
+    )
+)
+class MMMUPROAdapter(VisionLanguageAdapter):
+    MAX_IMAGES: int = 7
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.reformat_subset = True
+        self.dataset_format = self.extra_params.get('dataset_format', 'standard (4 options)')
+        if self.dataset_format not in DATASET_FORMATS:
+            logger.warning(f"Invalid dataset_format '{self.dataset_format}', fallback to 'standard (4 options)'")
+            self.dataset_format = 'standard (4 options)'
+        self.default_subset = self.dataset_format
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        metadata = {
+            'id': record['id'],
+            'explanation': record.get('explanation'),
+            'img_type': record.get('img_type'),
+            'topic_difficulty': record.get('topic_difficulty'),
+            'subject': record.get('subject')
+        }
+
+        answers_list: List[str] = ast.literal_eval(record['options'])
+
+        if self.dataset_format == 'vision':
+            letters = ','.join(answer_character(i) for i in range(len(answers_list)))
+            input_text = VISION_PROMPT.format(letters=letters)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+            image = record.get('image')
+            if image:
+                content_list.append(ContentImage(image=bytes_to_base64(image['bytes'], format='png', add_header=True)))
+        else:
+            input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(MMMUPROAdapter.MAX_IMAGES):
+            image = record.get(f'image_{i+1}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=answers_list,
+            target=record['answer'],
+            subset_key=record['subject'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
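
MMMU-Pro adds a dataset_format switch ('standard (4 options)', 'standard (10 options)', or 'vision') read from extra_params, with invalid values falling back to 'standard (4 options)'. A hedged sketch of selecting the vision format, assuming the usual dataset_args plumbing for per-benchmark extra_params (parameter layout may differ by version):

# Hedged sketch: running MMMU-Pro in its image-only 'vision' format.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='your-vision-language-model',  # placeholder model name
    datasets=['mmmu_pro'],
    dataset_args={
        'mmmu_pro': {
            'extra_params': {
                'dataset_format': 'vision',  # one of the DATASET_FORMATS listed above
            },
        },
    },
    limit=5,
)
run_task(task_cfg=task_cfg)
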
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -164,7 +164,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
                 records.append(record)
 
         dataset = DictDataLoader(
-            dict_list=records, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+            dict_list=records,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
         ).load()
 
         datasets[subset_name] = dataset
evalscope/benchmarks/tau_bench/generation.py
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)
 
         next_message = oai_res[0].message.model_dump(exclude_none=True)
 
evalscope/benchmarks/tau_bench/tau_bench_adapter.py
@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
@@ -35,8 +36,8 @@ logger = get_logger()
         'api_key': 'EMPTY',
         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
         'generation_config': {
-            'temperature': 0.7,
-            'max_new_tokens': 1024
+            'temperature': 0.0,
+            'max_tokens': 4096,
         }
     }
 )
@@ -46,22 +47,13 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        spec = importlib.util.find_spec('tau_bench')
-        if spec is None:
-            raise ImportError(
-                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
-            )
+        check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
@@ -84,10 +76,10 @@ class TauBenchAdapter(DefaultDataAdapter):
 
             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message = res.message.model_dump(exclude_none=True)
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return message['content']
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +106,11 @@ class TauBenchAdapter(DefaultDataAdapter):
             })
         # load dataset
         dataset = DictDataLoader(
-            dict_list=tasks, sample_fields=self.record_to_sample, limit=self.limit, repeats=self.repeats
+            dict_list=tasks,
+            sample_fields=self.record_to_sample,
+            limit=self.limit,
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()
 
         data_dict[env_name] = dataset
@@ -145,15 +141,15 @@ class TauBenchAdapter(DefaultDataAdapter):
 
         try:
             # Parse the prediction to get the reward
-            res = task_state.metadata
-            reward = res.get('reward', 0.0)
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
 
             score.value = {
                 'Pass^1': float(reward),
            }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result': res,
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
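
The τ-bench changes above replace the manual importlib check with check_import, change the simulated-user defaults (temperature 0.7 to 0.0, max_new_tokens 1024 to max_tokens 4096), and score from task_state.metadata['task_result']. A hedged sketch of overriding those user-model settings through extra_params, assuming the same dataset_args mechanism as other benchmarks (keys taken from the diff; everything else is illustrative):

# Hedged sketch: overriding the tau_bench user-simulator settings shown above.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='your-agent-model',  # placeholder for the model under test
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'extra_params': {
                'user_model': 'qwen-plus',  # model that plays the simulated user
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        },
    },
)
run_task(task_cfg=task_cfg)
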
evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark',
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark',
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
             }
         )
evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py
@@ -10,6 +10,7 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='tifa160',
+        pretty_name='TIFA-160',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
         description='TIFA-160 Text-to-Image Benchmark',
         tags=[Tags.TEXT_TO_IMAGE],
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):
 
         super().__init__(**kwargs)
 
-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
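
The TruthfulQA change moves choice shuffling from the adapter's __init__ into the BenchmarkMeta declaration. A custom benchmark could presumably set the same flag declaratively; a hedged sketch using only fields that appear in the diffs above (the benchmark name, dataset id, and adapter base-class import path are illustrative assumptions):

# Hedged sketch: declaring shuffle_choices in BenchmarkMeta for a custom multiple-choice benchmark.
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter  # MultiChoiceAdapter import path assumed
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq_benchmark',          # illustrative benchmark name
        dataset_id='my-org/my-mcq-set',   # illustrative dataset id
        metric_list=['multi_choice_acc'],
        subset_list=['default'],
        shuffle_choices=True,             # declared here instead of in the adapter __init__
        few_shot_num=0,
        train_split=None,
        eval_split='validation',
    )
)
class MyMCQAdapter(MultiChoiceAdapter):
    pass
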
evalscope/cli/start_app.py
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.app import create_app
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.perf.main import run_perf_benchmark
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)