evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py
@@ -0,0 +1,205 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
+import re
+import urllib.request
+import zipfile
+from pathlib import Path
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Default judge prompt template
+JUDGE_PROMPT = """Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {correct_answer}
+CANDIDATE ANSWER TO ASSESS: {response}
+
+Reply only with CORRECT or INCORRECT."""
+
+PROMPT_TEMPLATE = """
+BEGIN INPUT DOCUMENTS
+
+{documents_text}
+
+END INPUT DOCUMENTS
+
+Answer the following question using the input documents provided above.
+
+START QUESTION
+
+{question}
+
+END QUESTION
+"""
+
+# New constants for auto-download
+DOWNLOAD_URL: str = (
+    'https://modelscope.cn/datasets/evalscope/AA-LCR/resolve/master/extracted_text/AA-LCR_extracted-text.zip'
+)
+DEFAULT_CACHE_SUBDIR: str = 'aa_lcr'
+DEFAULT_ZIP_NAME: str = 'AA-LCR_extracted-text.zip'
+DEFAULT_EXTRACTED_DIR_NAME: str = 'lcr'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='aa_lcr',
+        pretty_name='AA-LCR',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING, Tags.LONG_CONTEXT],
+        description='AA-LCR (Artificial Analysis Long Context Retrieval) is a benchmark for evaluating long-context '
+        'retrieval and reasoning capabilities of language models across multiple documents.',  # noqa: E501
+        dataset_id='evalscope/AA-LCR',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={'text_dir': None}
+    )
+)
+class AALCRAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+        # Get extra parameters
+        self.text_dir = self.extra_params.get('text_dir')
+
+    def load(self):
+        # Auto download and extract when text_dir is not provided
+        if not self.text_dir:
+            self.text_dir = self._ensure_text_dir_downloaded()
+        elif not Path(self.text_dir).exists():
+            raise ValueError(
+                'AA-LCR text_dir does not exist: '
+                f'{self.text_dir}. Please provide a valid directory or omit text_dir to auto-download.'
+            )
+
+        self.text_dir = Path(self.text_dir)
+        return super().load()
+
+    def _ensure_text_dir_downloaded(self) -> Path:
+        """Ensure AA-LCR extracted texts are available locally; download and extract if missing."""
+        cache_root = Path(DEFAULT_EVALSCOPE_CACHE_DIR) / DEFAULT_CACHE_SUBDIR
+        extracted_dir = cache_root / DEFAULT_EXTRACTED_DIR_NAME
+
+        if extracted_dir.exists():
+            logger.info(f'AA-LCR documents found: {extracted_dir}')
+            return extracted_dir
+
+        cache_root.mkdir(parents=True, exist_ok=True)
+        zip_path = cache_root / DEFAULT_ZIP_NAME
+
+        try:
+            logger.info(f'Downloading AA-LCR documents from {DOWNLOAD_URL} to {zip_path}...')
+            urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
+
+            logger.info(f'Extracting {zip_path} to {cache_root}...')
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                zf.extractall(cache_root)
+
+            if not extracted_dir.exists():
+                raise ValueError(f'Extraction succeeded but target directory not found: {extracted_dir}')
+
+            logger.info(f'AA-LCR documents ready at {extracted_dir}')
+            return extracted_dir
+        except Exception as e:
+            raise ValueError(
+                f'Failed to download or extract AA-LCR documents: {e}. '
+                'You can also manually download and set extra_params["text_dir"].'
+            ) from e
+        finally:
+            # Best-effort cleanup of the zip file
+            try:
+                if zip_path.exists():
+                    zip_path.unlink()
+            except Exception:
+                pass
+
+    def _get_context(self, record: Dict[str, Any]) -> str:
+        doc_folder = self.text_dir / record['document_category'] / record['document_set_id']
+
+        # Check if the document folder exists
+        if not doc_folder.exists() or not doc_folder.is_dir():
+            logger.warning(f'Document folder not found: {doc_folder}. Returning empty context.')
+            return ''
+
+        doc_blocks = []
+        try:
+            for file_path in doc_folder.iterdir():
+                if file_path.is_file():
+                    try:
+                        content = file_path.read_text(encoding='utf-8').strip()
+                        if content:
+                            doc_blocks.append(content)
+                    except (IOError, UnicodeDecodeError) as e:
+                        logger.warning(f'Could not read file {file_path}, skipping: {e}')
+        except OSError as e:
+            logger.warning(f'Could not access document folder {doc_folder}: {e}')
+            return f"ERROR: Could not read documents for {record['document_category']}/{record['document_set_id']}"
+
+        documents_text = '\n\n'.join(
+            f'BEGIN DOCUMENT {i + 1}:\n{doc}\nEND DOCUMENT {i + 1}' for i, doc in enumerate(doc_blocks)
+        )
+        return documents_text
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a record to a Sample with long-context prompt."""
+        context = self._get_context(record)
+        prompt = self.prompt_template.format(documents_text=context, question=record['question'])
+
+        return Sample(
+            input=[ChatMessageUser(content=prompt)],
+            target=record['answer'],
+            metadata={
+                'question': record['question'],
+                'data_source_urls': record['data_source_urls'],
+                'input_tokens': record.get('input_tokens', 0),
+            }
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.metadata['question'], correct_answer=reference, response=filtered_prediction
+        )
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        # Use word boundaries to avoid matching "CORRECT" within "INCORRECT"
+        is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score
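Note: the adapter above registers the benchmark under the name 'aa_lcr' and auto-downloads the extracted documents when extra_params['text_dir'] is left unset. A minimal usage sketch, assuming evalscope's documented TaskConfig/run_task entry points; the model id and the dataset_args nesting shown here are illustrative placeholders, not taken from this diff:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['aa_lcr'],               # benchmark name registered above
    dataset_args={
        # Leave 'text_dir' unset (None) to exercise the new auto-download path,
        # or point it at a local copy of the extracted AA-LCR documents.
        'aa_lcr': {'extra_params': {'text_dir': None}},
    },
    # AALCRAdapter enables an LLM judge, so judge settings may also be required.
)
run_task(task_cfg=task_cfg)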
evalscope/benchmarks/ai2d/ai2d_adapter.py
@@ -22,7 +22,8 @@ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
         name='ai2d',
         pretty_name='AI2D',
         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
-        description=
+        description=
+        'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.',  # noqa: E501
         dataset_id='lmms-lab/ai2d',
         subset_list=SUBSET_LIST,
         metric_list=['acc'],
@@ -37,7 +38,7 @@ class Ai2dAdapter(VisionLanguageAdapter):

     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         answers_list: list[str] = record['options']
-        input_text = prompt(question=record['question'], choices=answers_list, template=
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
         content_list: list[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:
evalscope/benchmarks/bfcl/bfcl_adapter.py
@@ -8,11 +8,10 @@ from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
 from evalscope.api.metric import Score
-from evalscope.api.metric.scorer import AggScore
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
-from evalscope.report import Category, Report, Subset
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
 from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

@@ -50,7 +49,7 @@ SUBJECT_MAPPING = {
         'functions. Unlike previous evaluations, '
         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
         'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
         dataset_id='AI-ModelScope/bfcl_v3',
         subset_list=list(SUBJECT_MAPPING.keys()),
         metric_list=['acc'],
@@ -79,40 +78,6 @@ class BFCLAdapter(DefaultDataAdapter):
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)

-    def _weighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate weighted average for given subsets.
-
-        Returns:
-            Subset: A new Subset object with weighted average score
-        """
-        total_score = 0
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                total_score += subset.score * subset.num
-                total_count += subset.num
-
-        weighted_avg = total_score / total_count if total_count > 0 else 0
-        return Subset(name='', score=weighted_avg, num=total_count)
-
-    def _unweighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate unweighted average for given subsets.
-
-        Returns:
-            Subset: A new Subset object with unweighted average score
-        """
-        scores = []
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                scores.append(subset.score)
-                total_count += subset.num
-
-        unweighted_avg = sum(scores) / len(scores) if scores else 0
-        return Subset(name='', score=unweighted_avg, num=total_count)
-
     def preprocess_row(self, row: dict):
         """
         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
@@ -323,19 +288,19 @@ class BFCLAdapter(DefaultDataAdapter):

         # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
         simple_subsets = ['simple', 'java', 'javascript']
-        simple_ast =
+        simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
         subset_dict['simple_ast'] = simple_ast

         # Step 2.1: Calculate ast_non_live
         # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
         ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
-        ast_non_live =
+        ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
         subset_dict['ast_non_live'] = ast_non_live

         # Step 2.2: Calculate ast_live
         # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
         live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
-        ast_live =
+        ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
         subset_dict['ast_live'] = ast_live

         # Step 2.3: hallucination_non_live (irrelevance)
@@ -346,7 +311,7 @@ class BFCLAdapter(DefaultDataAdapter):

         # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
         hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
-        hallucination_live =
+        hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
         subset_dict['hallucination_live'] = hallucination_live

         # Step 2.5: multi_turn_base
@@ -356,27 +321,27 @@ class BFCLAdapter(DefaultDataAdapter):
         # Step 2.6: Calculate multi_turn_augmented
         # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
         multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
-        multi_turn_augmented =
+        multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
         subset_dict['multi_turn_augmented'] = multi_turn_augmented

         # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
         non_live_subsets = ['ast_non_live', 'hallucination_non_live']
-        non_live =
+        non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
         subset_dict['non_live'] = non_live

         # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
         live_agg_subsets = ['ast_live', 'hallucination_live']
-        live =
+        live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
         subset_dict['live'] = live

         # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
         multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
-        multi_turn =
+        multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
         subset_dict['multi_turn'] = multi_turn

         # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
         overall_subsets = ['non_live', 'live', 'multi_turn']
-        overall =
+        overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
         subset_dict['overall'] = overall

         # Add computed scores to the category
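Note: the hunks above drop BFCLAdapter's private aggregation helpers in favour of unweighted_average_from_subsets / weighted_average_from_subsets imported from evalscope.report (the file list shows evalscope/report/combinator.py growing by 52 lines). Judging from the removed in-class code, the shared helpers presumably aggregate Subset scores along these lines; this is an illustrative sketch, not the actual combinator implementation:

from typing import Dict, List

class Subset:
    """Stand-in for evalscope.report.Subset: a named score over `num` samples."""
    def __init__(self, name: str = '', score: float = 0.0, num: int = 0):
        self.name, self.score, self.num = name, score, num

def weighted_average_from_subsets(names: List[str], subsets: Dict[str, Subset]) -> Subset:
    # Sample-count-weighted mean: larger subsets contribute proportionally more.
    picked = [subsets[n] for n in names if n in subsets]
    total = sum(s.num for s in picked)
    score = sum(s.score * s.num for s in picked) / total if total else 0.0
    return Subset(score=score, num=total)

def unweighted_average_from_subsets(names: List[str], subsets: Dict[str, Subset]) -> Subset:
    # Plain mean of the subset scores, regardless of subset size.
    picked = [subsets[n] for n in names if n in subsets]
    score = sum(s.score for s in picked) / len(picked) if picked else 0.0
    return Subset(score=score, num=sum(s.num for s in picked))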
evalscope/benchmarks/blink/__init__.py: File without changes
evalscope/benchmarks/blink/blink_adapter.py
@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.',  # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
evalscope/benchmarks/chartqa/__init__.py: File without changes
evalscope/benchmarks/chartqa/chartqa_adapter.py
@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+OPEN_PROMPT = """
+{question}
+
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.',  # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'],  # 'human_test' or 'augmented_split'
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )
evalscope/benchmarks/chartqa/utils.py
@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+        target: List of target string.
+        prediction: List of predicted string.
+        max_relative_change: Maximum relative change.
+
+    Returns:
+        Whether the prediction was correct given the specified tolerance.
+    """  # noqa: E501
+
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()
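Note: a few worked examples of the relaxed-accuracy rule defined above (plain calls to the function added in this release; the sample values are illustrative):

from evalscope.benchmarks.chartqa.utils import relaxed_correctness

print(relaxed_correctness('42.5', '42'))    # True: ~1.2% relative error, within the 5% tolerance
print(relaxed_correctness('50', '42'))      # False: ~19% relative error
print(relaxed_correctness('25%', '0.25'))   # True: percentages are normalized to floats first
print(relaxed_correctness('Blue', 'blue'))  # True: non-numeric answers use case-insensitive exact match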
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -20,11 +20,12 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name=DataCollection.NAME,
+        pretty_name='Data-Collection',
         dataset_id='',  # dataset_id need to be set
         description='Custom Data collection, mixing multiple evaluation datasets for '
         'a unified evaluation, aiming to use less data to achieve a more comprehensive '
         'assessment of the model\'s capabilities. '
-        '[Usage Reference](https://evalscope.readthedocs.io/
+        '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
         tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
evalscope/benchmarks/docvqa/__init__.py: File without changes
evalscope/benchmarks/docvqa/docvqa_adapter.py
@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
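Note: the DocVQA adapter stores the gold answers as a JSON-encoded list and scores with 'anls'; the metric itself lives elsewhere in evalscope/metrics. For reference, the standard Average Normalized Levenshtein Similarity is commonly computed along these lines; this is a generic sketch with the usual 0.5 threshold, not necessarily evalscope's exact implementation:

import json

def _levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def anls_score(prediction: str, answers_json: str, threshold: float = 0.5) -> float:
    """Best normalized similarity over the gold answers; values below the threshold count as 0."""
    answers = json.loads(answers_json)  # the adapter stores answers as json.dumps(list)
    best = 0.0
    for gold in answers:
        p, g = prediction.strip().lower(), gold.strip().lower()
        nl = _levenshtein(p, g) / max(len(p), len(g), 1)
        best = max(best, 1.0 - nl)
    return best if best >= threshold else 0.0

# Example: anls_score('invoice', '["Invoice", "invoice no."]') -> 1.0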
evalscope/benchmarks/general_arena/general_arena_adapter.py
@@ -31,7 +31,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
         'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
         'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
         'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
-        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html).',
         dataset_id='general_arena',
         metric_list=['winrate'],
         few_shot_num=0,
evalscope/benchmarks/general_arena/utils.py
@@ -34,7 +34,8 @@ def process_review_item(review_result: ReviewResult) -> list:
         'Index': str(review_result.index),
         'Input': review_result.input,
         'Question': review_result.input,  # Use input as question
-        'Generated':
+        'Generated':
+        prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
         'Gold': target,
         'Pred': extracted_prediction,
         'Score': sample_score.score.model_dump(exclude_none=True),
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -20,7 +20,7 @@ logger = get_logger()
         name='general_mcq',
         pretty_name='General-MCQ',
         description='A general multiple-choice question answering dataset for custom evaluation. '
-        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#mcq).',
         tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
         dataset_id='general_mcq',
         subset_list=['default'],
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -20,7 +20,7 @@ PROMPT_TEMPLATE = '请回答问题\n{question}'
         name='general_qa',
         pretty_name='General-QA',
         description='A general question answering dataset for custom evaluation. '
-        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
         tags=[Tags.QA, Tags.CUSTOM],
         dataset_id='general_qa',
         metric_list=['BLEU', 'Rouge'],