evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
Files changed (87)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  2. evalscope/app/ui/multi_model.py +6 -1
  3. evalscope/app/ui/single_model.py +8 -2
  4. evalscope/app/utils/data_utils.py +3 -2
  5. evalscope/app/utils/visualization.py +2 -2
  6. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  7. evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
  8. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  9. evalscope/benchmarks/chartqa/__init__.py +0 -0
  10. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  11. evalscope/benchmarks/chartqa/utils.py +38 -0
  12. evalscope/benchmarks/docvqa/__init__.py +0 -0
  13. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  14. evalscope/benchmarks/general_arena/utils.py +2 -1
  15. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  16. evalscope/benchmarks/infovqa/__init__.py +0 -0
  17. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  18. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  19. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  20. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  21. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  22. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  23. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  24. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  25. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  26. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  27. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  28. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  29. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  30. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  31. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  32. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  33. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  34. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  35. evalscope/metrics/metric.py +51 -0
  36. evalscope/metrics/metrics.py +16 -0
  37. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  38. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  39. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  40. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  41. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  42. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  43. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  44. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  46. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  48. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  49. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  50. evalscope/report/__init__.py +9 -1
  51. evalscope/report/combinator.py +52 -2
  52. evalscope/utils/json_schema.py +8 -6
  53. evalscope/utils/multi_choices.py +16 -1
  54. evalscope/version.py +2 -2
  55. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
  56. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
  57. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  58. tests/__init__.py +0 -1
  59. tests/benchmark/__init__.py +0 -1
  60. tests/benchmark/test_eval.py +0 -429
  61. tests/benchmark/test_image_edit.py +0 -65
  62. tests/benchmark/test_sandbox.py +0 -81
  63. tests/benchmark/test_t2i.py +0 -142
  64. tests/benchmark/test_vlm.py +0 -137
  65. tests/cli/__init__.py +0 -1
  66. tests/cli/test_all.py +0 -269
  67. tests/cli/test_collection.py +0 -99
  68. tests/cli/test_custom.py +0 -268
  69. tests/cli/test_reasoning.py +0 -81
  70. tests/common.py +0 -73
  71. tests/perf/__init__.py +0 -1
  72. tests/perf/test_perf.py +0 -206
  73. tests/rag/test_clip_benchmark.py +0 -87
  74. tests/rag/test_mteb.py +0 -213
  75. tests/rag/test_ragas.py +0 -128
  76. tests/swift/__init__.py +0 -1
  77. tests/swift/test_run_swift_eval.py +0 -146
  78. tests/swift/test_run_swift_vlm_eval.py +0 -128
  79. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  80. tests/test_run_all.py +0 -12
  81. tests/utils.py +0 -13
  82. tests/vlm/__init__.py +0 -1
  83. tests/vlm/test_vlmeval.py +0 -102
  84. {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
  85. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  86. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  87. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
@@ -128,6 +128,9 @@ class DefaultDataAdapter(DataAdapter):
         for sample in self.test_dataset[subset]:
             if isinstance(sample.input, str):
                 sample.input = self.process_sample_str_input(sample, subset)
+            elif isinstance(sample.input, list):
+                # Handle list[ChatMessage] and add system prompt if needed
+                sample.input = self.process_sample_messages_input(sample, subset)
 
     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
         """
@@ -142,6 +145,15 @@ class DefaultDataAdapter(DataAdapter):
             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
         return input_messages
 
+    def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+        """
+        Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+        """
+        messages = list(sample.input)  # shallow copy to avoid in-place mutations
+        if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+            messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+        return messages
+
     def process_sample_input(self, sample: Sample, subset: str) -> str:
         """
         Process a single sample's input by applying prompt templates and few-shot formatting.
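
Editorial note (not part of the diff): the new elif branch lets adapters hand the evaluator a pre-built message list instead of a plain string. The sketch below reproduces only the normalization logic, using toy stand-ins rather than evalscope's ChatMessage classes.

from dataclasses import dataclass


@dataclass
class SystemMsg:
    content: str


@dataclass
class UserMsg:
    content: str


def normalize(messages, system_prompt):
    messages = list(messages)  # shallow copy, as in the adapter
    if system_prompt and not any(isinstance(m, SystemMsg) for m in messages):
        messages = [SystemMsg(content=system_prompt)] + messages
    return messages


msgs = normalize([UserMsg('What is 2 + 2?')], system_prompt='You are a careful math tutor.')
print([type(m).__name__ for m in msgs])  # ['SystemMsg', 'UserMsg'] -- system prompt prepended exactly once
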
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
 
         # Get subset choices - should be same for both models
-        subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
 
         return gr.update(choices=subsets, value=None), None
 
@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list=report_list)
+        report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
         analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
-        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
+
         logger.debug(f'subsets: {subsets}')
         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
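
Editorial note (not part of the diff): both UI tabs now derive the subset dropdown with the same pandas filter. A toy illustration, with hypothetical column names 'cat.0' and 'subset_name' standing in for ReportKey.category_prefix + '0' and ReportKey.subset_name; rows whose first category is '-' (presumably aggregate or placeholder rows) are excluded.

import pandas as pd

df = pd.DataFrame({
    'cat.0': ['reasoning', 'reasoning', '-', 'knowledge'],
    'subset_name': ['gsm8k', None, 'OVERALL', 'mmlu'],
})

subsets = sorted(df.loc[df['cat.0'].ne('-'), 'subset_name'].dropna().unique().tolist())
print(subsets)  # ['gsm8k', 'mmlu']
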
@@ -168,9 +168,10 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         'Index': str(review_result.index),
         'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
         'Metadata': metadata,
-        'Generated': prediction,
+        'Generated': prediction or '',  # Ensure no None value
         'Gold': target,
-        'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+        'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+        or '',  # Ensure no None value
         'Score': score.model_dump(exclude_none=True),
         'NScore': normalize_score(score.main_value)
     }
@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     df = get_data_frame(report_list=report_list, flatten_metrics=False)
     categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
    df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
 
    plot = px.sunburst(
@@ -22,7 +22,8 @@ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
         name='ai2d',
         pretty_name='AI2D',
         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
-        description='A Diagram Is Worth A Dozen Images',
+        description=
+        'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.',  # noqa: E501
         dataset_id='lmms-lab/ai2d',
         subset_list=SUBSET_LIST,
         metric_list=['acc'],
@@ -37,7 +38,7 @@ class Ai2dAdapter(VisionLanguageAdapter):
 
     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         answers_list: list[str] = record['options']
-        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
         content_list: list[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:
@@ -8,11 +8,10 @@ from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
 from evalscope.api.metric import Score
-from evalscope.api.metric.scorer import AggScore
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
-from evalscope.report import Category, Report, Subset
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
 from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
@@ -79,40 +78,6 @@ class BFCLAdapter(DefaultDataAdapter):
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
 
-    def _weighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate weighted average for given subsets.
-
-        Returns:
-            Subset: A new Subset object with weighted average score
-        """
-        total_score = 0
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                total_score += subset.score * subset.num
-                total_count += subset.num
-
-        weighted_avg = total_score / total_count if total_count > 0 else 0
-        return Subset(name='', score=weighted_avg, num=total_count)
-
-    def _unweighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate unweighted average for given subsets.
-
-        Returns:
-            Subset: A new Subset object with unweighted average score
-        """
-        scores = []
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                scores.append(subset.score)
-                total_count += subset.num
-
-        unweighted_avg = sum(scores) / len(scores) if scores else 0
-        return Subset(name='', score=unweighted_avg, num=total_count)
-
     def preprocess_row(self, row: dict):
         """
         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
@@ -323,19 +288,19 @@ class BFCLAdapter(DefaultDataAdapter):
 
         # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
         simple_subsets = ['simple', 'java', 'javascript']
-        simple_ast = self._unweighted_average_from_subsets(simple_subsets, subset_dict)
+        simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
         subset_dict['simple_ast'] = simple_ast
 
         # Step 2.1: Calculate ast_non_live
         # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
         ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
-        ast_non_live = self._unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+        ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
         subset_dict['ast_non_live'] = ast_non_live
 
         # Step 2.2: Calculate ast_live
         # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
         live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
-        ast_live = self._weighted_average_from_subsets(live_subsets, subset_dict)
+        ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
         subset_dict['ast_live'] = ast_live
 
         # Step 2.3: hallucination_non_live (irrelevance)
@@ -346,7 +311,7 @@ class BFCLAdapter(DefaultDataAdapter):
 
         # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
         hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
-        hallucination_live = self._weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+        hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
         subset_dict['hallucination_live'] = hallucination_live
 
         # Step 2.5: multi_turn_base
@@ -356,27 +321,27 @@ class BFCLAdapter(DefaultDataAdapter):
         # Step 2.6: Calculate multi_turn_augmented
         # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
         multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
-        multi_turn_augmented = self._weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+        multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
         subset_dict['multi_turn_augmented'] = multi_turn_augmented
 
         # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
         non_live_subsets = ['ast_non_live', 'hallucination_non_live']
-        non_live = self._unweighted_average_from_subsets(non_live_subsets, subset_dict)
+        non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
         subset_dict['non_live'] = non_live
 
         # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
         live_agg_subsets = ['ast_live', 'hallucination_live']
-        live = self._weighted_average_from_subsets(live_agg_subsets, subset_dict)
+        live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
         subset_dict['live'] = live
 
         # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
         multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
-        multi_turn = self._unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+        multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
         subset_dict['multi_turn'] = multi_turn
 
         # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
         overall_subsets = ['non_live', 'live', 'multi_turn']
-        overall = self._unweighted_average_from_subsets(overall_subsets, subset_dict)
+        overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
         subset_dict['overall'] = overall
 
         # Add computed scores to the category
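
Editorial note (not part of the diff): the private averaging helpers removed above were promoted to module-level functions in evalscope.report (the combinator.py side of this change, +52 -2, is not shown here). Assuming the moved functions keep the signature of the removed methods, the difference between the two aggregation modes looks like this:

from evalscope.report import Subset, unweighted_average_from_subsets, weighted_average_from_subsets

# Two hypothetical subsets with very different sample counts.
subset_dict = {
    'live_simple': Subset(name='live_simple', score=0.90, num=100),
    'live_parallel': Subset(name='live_parallel', score=0.50, num=20),
}

weighted = weighted_average_from_subsets(['live_simple', 'live_parallel'], subset_dict)
unweighted = unweighted_average_from_subsets(['live_simple', 'live_parallel'], subset_dict)
print(round(weighted.score, 3))    # 0.833 -> (0.90 * 100 + 0.50 * 20) / 120, weighted by sample count
print(round(unweighted.score, 2))  # 0.7   -> (0.90 + 0.50) / 2, every subset counts equally
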
@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.',  # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
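
Editorial note (not part of the diff): once registered, the benchmark is addressable by the name 'blink'. A minimal smoke-test sketch using the TaskConfig/run_task entry points from the evalscope documentation; the model id is a placeholder, and serving a multimodal model (e.g. behind an OpenAI-compatible endpoint) is assumed to be configured separately.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-vl-7b-instruct',  # placeholder model id
    datasets=['blink'],              # name registered by the adapter above
    limit=5,                         # evaluate only a few samples as a smoke test
)
run_task(task_cfg=task_cfg)
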
File without changes
@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+OPEN_PROMPT = """
+{question}
+
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.',  # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'],  # 'human_test' or 'augmented_split'
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )
@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+        target: List of target string.
+        prediction: List of predicted string.
+        max_relative_change: Maximum relative change.
+
+    Returns:
+        Whether the prediction was correct given the specified tolerance.
+    """  # noqa: E501
+
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()
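
Editorial note (not part of the diff): a few worked calls against the function above, imported from the module path this release adds (evalscope/benchmarks/chartqa/utils.py).

from evalscope.benchmarks.chartqa.utils import relaxed_correctness

print(relaxed_correctness('10.3', '10'))      # True  -- 3% relative change, within the 5% tolerance
print(relaxed_correctness('11', '10'))        # False -- 10% relative change exceeds the tolerance
print(relaxed_correctness('45%', '0.45'))     # True  -- percentages are converted to floats first
print(relaxed_correctness('Paris', 'paris'))  # True  -- non-numeric answers fall back to case-insensitive exact match
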
File without changes
@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
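
Editorial note (not part of the diff): the 'anls' metric declared by DocVQA and InfoVQA lives in evalscope/metrics (metrics.py, +16 above) and is not shown in this diff. Below is a self-contained sketch of the standard ANLS definition (best normalized Levenshtein similarity against any gold answer, zeroed below a 0.5 threshold), which the packaged metric presumably matches; the function name anls_score is illustrative, not evalscope's API.

def anls_score(prediction: str, gold_answers: list, threshold: float = 0.5) -> float:
    """Per-sample ANLS: best normalized Levenshtein similarity to any gold answer,
    zeroed out when it falls below the threshold."""

    def levenshtein(a: str, b: str) -> int:
        # Classic dynamic-programming edit distance over two strings.
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
            prev = cur
        return prev[-1]

    best = 0.0
    for gold in gold_answers:
        p, g = prediction.strip().lower(), str(gold).strip().lower()
        sim = 1.0 if p == g else 1.0 - levenshtein(p, g) / max(len(p), len(g), 1)
        best = max(best, sim)
    return best if best >= threshold else 0.0


print(anls_score('2019', ['2019', '19']))    # 1.0 -- exact match with one gold answer
print(anls_score('approx. 2019', ['2019']))  # 0.0 -- similarity of about 0.33 falls below the 0.5 threshold
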
@@ -34,7 +34,8 @@ def process_review_item(review_result: ReviewResult) -> list:
         'Index': str(review_result.index),
         'Input': review_result.input,
         'Question': review_result.input,  # Use input as question
-        'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+        'Generated':
+        prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
         'Gold': target,
         'Pred': extracted_prediction,
         'Score': sample_score.score.model_dump(exclude_none=True),
@@ -57,8 +57,9 @@ Your judgment must focus only on if there are meaningful differences between [co
         'humanities/social science (9%), computer science/artificial intelligence (10%), '
         'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
         'require the ability to understand both text and images, i.e., multi-modality. '
-        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
-        'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.',  # noqa: E501
+        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. \n'
+        '**To evaluate the performance of model without multi-modality capabilities, '
+        'please set the `extra_params["include_multi_modal"]` to `False`.**',  # noqa: E501
         dataset_id='cais/hle',
         subset_list=SUBSET_LIST,
         metric_list=['acc'],
File without changes
@@ -0,0 +1,66 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='infovqa',
+        pretty_name='InfoVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'InfoVQA (Information Visual Question Answering) is a benchmark designed to evaluate how well AI models can answer questions based on information-dense images, such as charts, graphs, diagrams, maps, and infographics.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['InfographicVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class InfoVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'answer_type': record.get('answer_type'),
+                'image_url': record.get('image_url'),
+                'ocr': record.get('ocr'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
@@ -35,7 +35,7 @@ class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
 
     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
-        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
         content_list: List[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:
@@ -77,7 +77,7 @@ class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
         answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
         answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]
         question_hint = record['hint'] + record['question']
-        input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=question_hint, choices=answers_list, template=self.prompt_template)
         content_list: List[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:
@@ -122,7 +122,7 @@ class MMMUAdapter(VisionLanguageAdapter):
             match = re.search(pattern, prediction)
             if match:
                 return match.group(1).strip()
-            return ''
+            return prediction.strip()
         else:
             raise ValueError(f'Unsupported question type: {question_type}')
 
File without changes