evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/multi_if/metrics.py
@@ -0,0 +1,120 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Tuple
+
+from evalscope.utils import get_logger
+from . import ifeval
+
+logger = get_logger()
+
+
+def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+    # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+    response = str(x['response'])
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+        if response and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+    response = str(x['response'])
+    r = response.split('\n')
+    response_remove_first = '\n'.join(r[1:]).strip()
+    response_remove_last = '\n'.join(r[:-1]).strip()
+    response_remove_both = '\n'.join(r[1:-1]).strip()
+    revised_response = response.replace('*', '')
+    revised_response_remove_first = response_remove_first.replace('*', '')
+    revised_response_remove_last = response_remove_last.replace('*', '')
+    revised_response_remove_both = response_remove_both.replace('*', '')
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = x['instruction_id_list']
+    is_following_list = []
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        instruction.build_description(**x['kwargs'][index])
+
+        is_following = False
+        for r in all_responses:  # type: ignore
+            if r.strip() and instruction.check_following(r):  # type: ignore
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+    return {
+        'follow_instruction_list': is_following_list,
+        'instruction_id_list': instruction_list,
+    }
+
+
+def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+    prompt_total = 0
+    prompt_correct = 0
+    instruction_total = 0
+    instruction_correct = 0
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+
+        prompt_total += 1
+        if all(follow_instruction_list):
+            prompt_correct += 1
+
+        instruction_total += len(instruction_id_list)
+        instruction_correct += sum(follow_instruction_list)
+
+    return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+        instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+    prompt_res = []
+    inst_res = []
+
+    for example in outputs:
+        follow_instruction_list = example['follow_instruction_list']
+        instruction_id_list = example['instruction_id_list']
+        if all(follow_instruction_list):
+            prompt_res.append(1)
+        else:
+            prompt_res.append(0)
+        inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+    return prompt_res, inst_res
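Note: a minimal sketch of how these helpers compose. It relies only on the dict keys used above ('follow_instruction_list', 'instruction_id_list'); the import path follows the new file location and the outputs payload is invented for illustration.

from evalscope.benchmarks.multi_if.metrics import parse_result

# parse_result aggregates per-sample instruction-following flags into
# (prompt-level accuracy, instruction-level accuracy).
outputs = [
    {'follow_instruction_list': [True, True], 'instruction_id_list': ['inst_a', 'inst_b']},
    {'follow_instruction_list': [True, False], 'instruction_id_list': ['inst_a', 'inst_c']},
]
prompt_acc, inst_acc = parse_result(outputs)
print(prompt_acc, inst_acc)  # 0.5 (1 of 2 prompts fully followed), 0.75 (3 of 4 instructions followed)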
evalscope/benchmarks/multi_if/multi_if_adapter.py
@@ -0,0 +1,161 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+from evalscope.api.metric import Score
+from evalscope.api.model import Model
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Chinese',
+    'English',
+    'German',
+    'Italian',
+    'Vietnamese',
+    'Spanish',
+    'Hindi',
+    'Portuguese',
+    'French',
+    'Thai',
+    'Russian',
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='multi_if',
+        pretty_name='Multi-IF',
+        description=
+        'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+        dataset_id='facebook/Multi-IF',
+        subset_list=SUBSET_LIST,
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={
+            'max_turns': 3,  # maximum number of turns to evaluate
+        }
+    )
+)
+class MultiIFAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Ensure required packages are installed
+        check_import(
+            module_name=['nltk', 'langdetect'],
+            package=['nltk', 'langdetect'],
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+        if 'Chinese' in self.subset_list:
+            check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+        if 'Thai' in self.subset_list:
+            check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+        self.reformat_subset = True
+        self.max_turns = self.extra_params.get('max_turns', 3)
+        if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+            logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+            self.max_turns = 3
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=[ChatMessageUser(content='')],  # NOTE: we will build the multi turn conversation in the evaluator
+            target='',
+            subset_key=record['language'],
+            metadata=record,
+        )
+
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        """
+        Run multi-turn inference with the model and sample.
+        """
+        record = sample.metadata
+        history = []
+        step_record = {}
+        for step in range(1, self.max_turns + 1):
+            current_prompt = json.loads(record[f'turn_{step}_prompt'])
+            history.append(ChatMessageUser(content=current_prompt['content']))
+            # Generate model output
+            model_output = model.generate(input=history, tools=sample.tools)
+
+            response = model_output.completion
+            instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+            kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+            _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+            step_record[step] = {
+                'prompt': messages_pretty_str(history),
+                'response': response,
+                'instruction_id_list': instruction_id_list,
+                'kwargs': _kwargs
+            }
+
+            # Append model output to history for next turn
+            history.append(model_output.message)
+
+        sample.metadata['step_record'] = step_record
+        return TaskState(
+            model=model.name,
+            sample=sample,
+            messages=history,
+            output=model_output,
+            completed=True,
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
+        from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        step_record = task_state.metadata['step_record']
+        results = {}
+        try:
+            for step, record in step_record.items():
+                outputs_strict = gen_acc_strict(record)
+                outputs_loose = gen_acc_loose(record)
+                prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                results.update({
+                    f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                    f'turn_{step}_inst_level_strict': inst_level_strict,
+                    f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                    f'turn_{step}_inst_level_loose': inst_level_loose,
+                })
+            score.value.update(results)
+
+            # Set main score name
+            if results:
+                score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}
+
+        return score
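Note: a minimal usage sketch for the new Multi-IF benchmark, assuming the TaskConfig/run_task entry points (evalscope/config.py, evalscope/run.py) keep their current shape and that per-dataset settings such as subset_list and extra_params are passed through dataset_args; the model name is a placeholder.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model identifier
    datasets=['multi_if'],
    dataset_args={
        'multi_if': {
            'subset_list': ['English', 'Chinese'],
            'extra_params': {'max_turns': 2},  # evaluate only the first two turns
        }
    },
)
run_task(task_cfg=task_cfg)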
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
         super().__init__(**kwargs)
 
         self._use_llm_judge = True
+        self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
         # set extra params
         self.retrieval_question = self.extra_params.get(
             'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -359,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
 
         return score
 
-    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-        # Don't add aggregation name for needle haystack adapter
-        return super()._on_generate_report(scores, model_name, False)
-
     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py
@@ -0,0 +1,101 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Regular Text Recognition', 'Irregular Text Recognition', 'Artistic Text Recognition', 'Handwriting Recognition',
+    'Digit String Recognition', 'Non-Semantic Text Recognition', 'Scene Text-centric VQA', 'Doc-oriented VQA',
+    'Key Information Extraction', 'Handwritten Mathematical Expression Recognition'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ocr_bench',
+        pretty_name='OCRBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation.',  # noqa: E501
+        dataset_id='evalscope/OCRBench',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='{question}',
+    )
+)
+class OCRBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = self.prompt_template.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answer'), ensure_ascii=False),  # answers is a list
+            subset_key=record.get('question_type'),
+            metadata={
+                'dataset': record.get('dataset'),
+                'question_type': record.get('question_type'),
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        pred = filtered_prediction.lower().strip()
+        gt_ans = json.loads(reference)
+        dataset_name = task_state.metadata['dataset']
+
+        score_value = 0
+        if dataset_name == 'HME100k':
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].strip().replace('\n', ' ').replace(' ', '')
+                    predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.strip().replace('\n', ' ').replace(' ', '')
+                predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                if answer in predict:
+                    score_value = 1
+        else:
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].lower().strip().replace('\n', ' ')
+                    predict = pred.lower().strip().replace('\n', ' ')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.lower().strip().replace('\n', ' ')
+                predict = pred.lower().strip().replace('\n', ' ')
+                if answer in predict:
+                    score_value = 1
+        score.value = {'acc': score_value}
+        return score
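Note: to make the scoring rule above concrete, a small standalone restatement of the non-HME100k branch (case-insensitive containment of any ground-truth answer in the prediction); the strings are invented.

prediction = 'The sign in the image reads "OPEN 24 Hours".'
ground_truths = ['open 24 hours', '24 hours']

# Score is 1 if any normalized ground-truth answer appears inside the normalized prediction.
pred = prediction.lower().strip().replace('\n', ' ')
acc = int(any(ans.lower().strip().replace('\n', ' ') in pred for ans in ground_truths))
print(acc)  # 1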
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py
@@ -0,0 +1,87 @@
+# flake8: noqa
+import ast
+import re
+
+from .vqa_metric import vqa_evaluation
+
+
+def calculate_iou(box1, box2):
+    try:
+        box1 = [int(coordinate) for coordinate in box1]
+        box2 = [int(coordinate) for coordinate in box2]
+    except:
+        return 0
+
+    x1_inter = max(box1[0], box2[0])
+    y1_inter = max(box1[1], box2[1])
+    x2_inter = min(box1[2], box2[2])
+    y2_inter = min(box1[3], box2[3])
+
+    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    union_area = box1_area + box2_area - inter_area
+
+    iou = inter_area / union_area if union_area != 0 else 0
+
+    return iou
+
+
+def vqa_with_position_evaluation(predict, img_metas):
+    score_content, score_bbox = 0.0, 0.0
+    if 'answer' in predict.keys():
+        score_content = vqa_evaluation(predict['answer'], img_metas['answers'])
+    if 'bbox' in predict.keys():
+        gt_bbox = img_metas['bbox']
+        try:
+            predict_bbox_list = ast.literal_eval(predict['bbox'])
+            score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+        except:
+            score_bbox = 0
+    return 0.5 * score_content + 0.5 * score_bbox
+
+
+def extract_coordinates(text):
+    # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+    pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+    matches = list(re.finditer(pattern, text))
+    coords_list = []
+    coords_set = set()
+    for match in matches:
+        x1, y1, x2, y2 = map(int, match.groups())
+
+        if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+            coords = (x1, y1, x2, y2)
+
+            if coords in coords_set:
+                coords_list = [c for c in coords_list if c != coords]
+
+            coords_list.append(coords)
+            coords_set.add(coords)
+    if coords_list:
+        last_coords = coords_list[-1]
+        return list(last_coords)
+    else:
+        return None
+
+
+if __name__ == '__main__':
+    print('Example for Text Grounding task.')
+    box1 = [50, 50, 150, 150]
+    box2 = [60, 60, 140, 140]
+    iou_score = calculate_iou(box1, box2)
+    print(f'IoU score: {iou_score}')
+
+    print('Example for VQA with position task.')
+    pred = {'content': 'The content is Hello Buddies', 'bbox': box1}
+    gt = {'content': 'Hello Buddies', 'bbox': box2}
+
+    vqa_score = vqa_evaluation(pred['content'], gt['content'])
+    iou_score = calculate_iou(pred['bbox'], gt['bbox'])
+
+    print(f'VQA score: {vqa_score}')
+    print(f'IoU score: {iou_score}')
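Note: a quick check of extract_coordinates as defined above; the import path follows the new file location and the example text is invented.

from evalscope.benchmarks.ocr_bench_v2.IoUscore_metric import extract_coordinates

# Returns the last in-range (0..1000) box found in the text, or None when no box parses.
text = 'The title sits at (12, 34, 560, 78) and the answer is inside [100, 200, 300, 400].'
print(extract_coordinates(text))   # [100, 200, 300, 400]
print(extract_coordinates('no coordinates here'))  # None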