evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/multi_if/metrics.py
@@ -0,0 +1,120 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from evalscope.utils import get_logger
+ from . import ifeval
+
+ logger = get_logger()
+
+
+ def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+     # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+     response = str(x['response'])
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+         if response and instruction.check_following(response):
+             is_following_list.append(True)
+         else:
+             is_following_list.append(False)
+
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+     response = str(x['response'])
+     r = response.split('\n')
+     response_remove_first = '\n'.join(r[1:]).strip()
+     response_remove_last = '\n'.join(r[:-1]).strip()
+     response_remove_both = '\n'.join(r[1:-1]).strip()
+     revised_response = response.replace('*', '')
+     revised_response_remove_first = response_remove_first.replace('*', '')
+     revised_response_remove_last = response_remove_last.replace('*', '')
+     revised_response_remove_both = response_remove_both.replace('*', '')
+     all_responses = [
+         response,
+         revised_response,
+         response_remove_first,
+         response_remove_last,
+         response_remove_both,
+         revised_response_remove_first,
+         revised_response_remove_last,
+         revised_response_remove_both,
+     ]
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+
+         is_following = False
+         for r in all_responses:  # type: ignore
+             if r.strip() and instruction.check_following(r):  # type: ignore
+                 is_following = True
+                 break
+
+         is_following_list.append(is_following)
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+     prompt_total = 0
+     prompt_correct = 0
+     instruction_total = 0
+     instruction_correct = 0
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+
+         prompt_total += 1
+         if all(follow_instruction_list):
+             prompt_correct += 1
+
+         instruction_total += len(instruction_id_list)
+         instruction_correct += sum(follow_instruction_list)
+
+     return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+         instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+ def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+     prompt_res = []
+     inst_res = []
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+         if all(follow_instruction_list):
+             prompt_res.append(1)
+         else:
+             prompt_res.append(0)
+         inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+     return prompt_res, inst_res
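Taken together, these helpers score one conversation turn at a time: gen_acc_strict checks the raw response against every instruction, gen_acc_loose retries with variants that drop the first/last line and strip '*' markers, and parse_result reduces the per-instruction booleans to prompt-level and instruction-level accuracy. A minimal sketch of how they combine on a single hand-written turn record (the instruction id and kwargs are illustrative IFEval-style values, not taken from this diff):

# Illustrative only: scoring one invented turn record with the new helpers.
from evalscope.benchmarks.multi_if.metrics import gen_acc_loose, gen_acc_strict, parse_result

record = {
    'response': '* Option A\n* Option B',
    'instruction_id_list': ['detectable_format:number_bullet_lists'],  # assumed IFEval-style id
    'kwargs': [{'num_bullets': 2}],
}

strict = gen_acc_strict(record)   # the raw response must satisfy every instruction
loose = gen_acc_loose(record)     # also accepts trimmed / '*'-stripped variants

prompt_acc, inst_acc = parse_result([strict])
print(prompt_acc, inst_acc)       # 1.0 1.0 if the bullet-list instruction is satisfied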
evalscope/benchmarks/multi_if/multi_if_adapter.py
@@ -0,0 +1,161 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Chinese',
+     'English',
+     'German',
+     'Italian',
+     'Vietnamese',
+     'Spanish',
+     'Hindi',
+     'Portuguese',
+     'French',
+     'Thai',
+     'Russian',
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='multi_if',
+         pretty_name='Multi-IF',
+         description=
+         'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.',  # noqa: E501
+         tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+         dataset_id='facebook/Multi-IF',
+         subset_list=SUBSET_LIST,
+         metric_list=[
+             'prompt_level_strict',
+             'inst_level_strict',
+             'prompt_level_loose',
+             'inst_level_loose',
+         ],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train',
+         extra_params={
+             'max_turns': 3,  # maximum number of turns to evaluate
+         }
+     )
+ )
+ class MultiIFAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         # Ensure required packages are installed
+         check_import(
+             module_name=['nltk', 'langdetect'],
+             package=['nltk', 'langdetect'],
+             raise_error=True,
+             feature_name=self.pretty_name
+         )
+         if 'Chinese' in self.subset_list:
+             check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+         if 'Thai' in self.subset_list:
+             check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+         self.reformat_subset = True
+         self.max_turns = self.extra_params.get('max_turns', 3)
+         if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+             logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+             self.max_turns = 3
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=[ChatMessageUser(content='')],  # NOTE: we will build the multi turn conversation in the evaluator
+             target='',
+             subset_key=record['language'],
+             metadata=record,
+         )
+
+     def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+         """
+         Run multi-turn inference with the model and sample.
+         """
+         record = sample.metadata
+         history = []
+         step_record = {}
+         for step in range(1, self.max_turns + 1):
+             current_prompt = json.loads(record[f'turn_{step}_prompt'])
+             history.append(ChatMessageUser(content=current_prompt['content']))
+             # Generate model output
+             model_output = model.generate(input=history, tools=sample.tools)
+
+             response = model_output.completion
+             instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+             kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+             _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+             step_record[step] = {
+                 'prompt': messages_pretty_str(history),
+                 'response': response,
+                 'instruction_id_list': instruction_id_list,
+                 'kwargs': _kwargs
+             }
+
+             # Append model output to history for next turn
+             history.append(model_output.message)
+
+         sample.metadata['step_record'] = step_record
+         return TaskState(
+             model=model.name,
+             sample=sample,
+             messages=history,
+             output=model_output,
+             completed=True,
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate evaluation scores by comparing prediction with reference.
+         """
+         from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         step_record = task_state.metadata['step_record']
+         results = {}
+         try:
+             for step, record in step_record.items():
+                 outputs_strict = gen_acc_strict(record)
+                 outputs_loose = gen_acc_loose(record)
+                 prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                 prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                 results.update({
+                     f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                     f'turn_{step}_inst_level_strict': inst_level_strict,
+                     f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                     f'turn_{step}_inst_level_loose': inst_level_loose,
+                 })
+             score.value.update(results)
+
+             # Set main score name
+             if results:
+                 score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+         except Exception as e:
+             logger.error(f'Error calculating ifeval metrics: {e}')
+             score.value = {}
+
+         return score
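Since the adapter registers itself under the name 'multi_if', it should be selectable like any other benchmark through evalscope's task API. A hedged sketch, assuming the usual TaskConfig/run_task entry points and the dataset_args conventions used by other adapters (the model id is a placeholder, not part of this diff):

# Sketch only: model id and dataset_args layout are assumptions for illustration.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-chat-model',                        # placeholder model identifier
    datasets=['multi_if'],                        # name registered by @register_benchmark above
    dataset_args={
        'multi_if': {
            'subset_list': ['English', 'Chinese'],   # any subset of SUBSET_LIST
            'extra_params': {'max_turns': 2},        # values outside 1-3 are clamped to 3
        }
    },
)
run_task(task_cfg)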
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
          super().__init__(**kwargs)
 
          self._use_llm_judge = True
+         self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
          # set extra params
          self.retrieval_question = self.extra_params.get(
              'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -359,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
 
          return score
 
-     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-         # Don't add aggregation name for needle haystack adapter
-         return super()._on_generate_report(scores, model_name, False)
-
      def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
          try:
              import os
evalscope/benchmarks/ocr_bench/__init__.py (file without changes)
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py
@@ -0,0 +1,101 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Regular Text Recognition', 'Irregular Text Recognition', 'Artistic Text Recognition', 'Handwriting Recognition',
+     'Digit String Recognition', 'Non-Semantic Text Recognition', 'Scene Text-centric VQA', 'Doc-oriented VQA',
+     'Key Information Extraction', 'Handwritten Mathematical Expression Recognition'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ocr_bench',
+         pretty_name='OCRBench',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation.',  # noqa: E501
+         dataset_id='evalscope/OCRBench',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template='{question}',
+     )
+ )
+ class OCRBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_aggregation_name = False
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         input_text = self.prompt_template.format(question=record['question'])
+         content_list: List[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=json.dumps(record.get('answer'), ensure_ascii=False),  # answers is a list
+             subset_key=record.get('question_type'),
+             metadata={
+                 'dataset': record.get('dataset'),
+                 'question_type': record.get('question_type'),
+             }
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         pred = filtered_prediction.lower().strip()
+         gt_ans = json.loads(reference)
+         dataset_name = task_state.metadata['dataset']
+
+         score_value = 0
+         if dataset_name == 'HME100k':
+             if isinstance(gt_ans, list):
+                 for j in range(len(gt_ans)):
+                     answer = gt_ans[j].strip().replace('\n', ' ').replace(' ', '')
+                     predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                     if answer in predict:
+                         score_value = 1
+             else:
+                 answer = gt_ans.strip().replace('\n', ' ').replace(' ', '')
+                 predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                 if answer in predict:
+                     score_value = 1
+         else:
+             if isinstance(gt_ans, list):
+                 for j in range(len(gt_ans)):
+                     answer = gt_ans[j].lower().strip().replace('\n', ' ')
+                     predict = pred.lower().strip().replace('\n', ' ')
+                     if answer in predict:
+                         score_value = 1
+             else:
+                 answer = gt_ans.lower().strip().replace('\n', ' ')
+                 predict = pred.lower().strip().replace('\n', ' ')
+                 if answer in predict:
+                     score_value = 1
+         score.value = {'acc': score_value}
+         return score
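The scoring above is plain containment: a sample counts as correct if any normalized ground-truth answer occurs as a substring of the normalized prediction, and for the HME100k (handwritten math) split all whitespace is stripped before comparing. A standalone restatement of that rule, for illustration only:

# Illustrative restatement of OCRBenchAdapter.match_score's containment rule (not part of the diff).
from typing import List


def ocr_bench_hit(prediction: str, answers: List[str], handwritten_math: bool = False) -> int:
    """Return 1 if any normalized answer is a substring of the normalized prediction."""
    pred = prediction.lower().strip()
    for answer in answers:
        if handwritten_math:
            # HME100k: drop newlines and every space before comparing
            a = answer.strip().replace('\n', ' ').replace(' ', '')
            p = pred.replace('\n', ' ').replace(' ', '')
        else:
            a = answer.lower().strip().replace('\n', ' ')
            p = pred.replace('\n', ' ')
        if a in p:
            return 1
    return 0


print(ocr_bench_hit('The sign reads: OPEN 24 Hours', ['open 24 hours']))  # -> 1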
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py
@@ -0,0 +1,87 @@
+ # flake8: noqa
+ import ast
+ import re
+
+ from .vqa_metric import vqa_evaluation
+
+
+ def calculate_iou(box1, box2):
+     try:
+         box1 = [int(coordinate) for coordinate in box1]
+         box2 = [int(coordinate) for coordinate in box2]
+     except:
+         return 0
+
+     x1_inter = max(box1[0], box2[0])
+     y1_inter = max(box1[1], box2[1])
+     x2_inter = min(box1[2], box2[2])
+     y2_inter = min(box1[3], box2[3])
+
+     inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+     box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+     union_area = box1_area + box2_area - inter_area
+
+     iou = inter_area / union_area if union_area != 0 else 0
+
+     return iou
+
+
+ def vqa_with_position_evaluation(predict, img_metas):
+     score_content, score_bbox = 0.0, 0.0
+     if 'answer' in predict.keys():
+         score_content = vqa_evaluation(predict['answer'], img_metas['answers'])
+     if 'bbox' in predict.keys():
+         gt_bbox = img_metas['bbox']
+         try:
+             predict_bbox_list = ast.literal_eval(predict['bbox'])
+             score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+         except:
+             score_bbox = 0
+     return 0.5 * score_content + 0.5 * score_bbox
+
+
+ def extract_coordinates(text):
+     # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+     pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+     matches = list(re.finditer(pattern, text))
+     coords_list = []
+     coords_set = set()
+     for match in matches:
+         x1, y1, x2, y2 = map(int, match.groups())
+
+         if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+             coords = (x1, y1, x2, y2)
+
+             if coords in coords_set:
+                 coords_list = [c for c in coords_list if c != coords]
+
+             coords_list.append(coords)
+             coords_set.add(coords)
+     if coords_list:
+         last_coords = coords_list[-1]
+         return list(last_coords)
+     else:
+         return None
+
+
+ if __name__ == '__main__':
+     print('Example for Text Grounding task.')
+     box1 = [50, 50, 150, 150]
+     box2 = [60, 60, 140, 140]
+     iou_score = calculate_iou(box1, box2)
+     print(f'IoU score: {iou_score}')
+
+     print('Example for VQA with position task.')
+     pred = {'content': 'The content is Hello Buddies', 'bbox': box1}
+     gt = {'content': 'Hello Buddies', 'bbox': box2}
+
+     vqa_score = vqa_evaluation(pred['content'], gt['content'])
+     iou_score = calculate_iou(pred['bbox'], gt['bbox'])
+
+     print(f'VQA score: {vqa_score}')
+     print(f'IoU score: {iou_score}')
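One helper the built-in __main__ demo does not exercise is extract_coordinates, which scans free-form text for (x1, y1, x2, y2) or [x1, y1, x2, y2] groups with values in the 0-1000 range and returns the last distinct match as a list. A quick sketch of the expected behaviour on a made-up model response:

# Sketch only: exercising extract_coordinates and calculate_iou on invented text and boxes.
from evalscope.benchmarks.ocr_bench_v2.IoUscore_metric import calculate_iou, extract_coordinates

text = 'The title sits at [120, 40, 560, 90]; the footer is at (10, 960, 500, 990).'
pred_box = extract_coordinates(text)                  # -> [10, 960, 500, 990], the last valid match
print(pred_box)
print(calculate_iou(pred_box, [12, 958, 505, 992]))   # IoU against a hypothetical ground-truth box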