evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged by the registry as possibly problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0

evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py
ADDED
@@ -0,0 +1,64 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+OPEN_PROMPT = (
+    'Read the picture and solve the following problem step by step.'
+    'The last line of your response should be of the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+    '{question}\n\n'
+    'Remember to put your answer on its own line at the end in the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+    ' and you do not need to use a \\boxed command.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='real_world_qa',
+        pretty_name='RealWorldQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.',  # noqa: E501
+        dataset_id='lmms-lab/RealWorldQA',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class RealWorldQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            metadata={'image_path': record['image_path']}
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''

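The adapter grades against the "ANSWER: ..." convention that OPEN_PROMPT enforces. A quick standalone check of that extraction rule (plain Python, mirroring extract_answer above):

import re

# Mirrors the pattern used by RealWorldQAAdapter.extract_answer.
prediction = 'The sign limits the speed to 35 mph.\nANSWER: 35'
match = re.search(r'ANSWER:\s*(.*)', prediction)
print(match.group(1).strip() if match else '')  # -> 35
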
evalscope/benchmarks/tau_bench/tau_bench_adapter.py
CHANGED
@@ -47,7 +47,12 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        check_import(
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')

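The expanded check_import call now names both an installable source and the feature that requires it. evalscope's real helper lives in evalscope.utils.import_utils; the sketch below is only a hypothetical illustration of what a guard with this signature typically does, not the actual implementation:

import importlib.util
from typing import List, Optional, Union

def check_import(module: Union[str, List[str]], package: Optional[str] = None,
                 raise_error: bool = False, feature_name: Optional[str] = None) -> bool:
    """Hypothetical sketch: verify that `module` is importable and, if not,
    point at the pip package and the feature that needs it."""
    modules = [module] if isinstance(module, str) else list(module)
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    if not missing:
        return True
    hint = f"pip install {package or ' '.join(missing)}"
    message = f"{feature_name or 'This feature'} requires {missing}; try `{hint}`."
    if raise_error:
        raise ImportError(message)
    print(message)
    return False
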
evalscope/config.py
CHANGED
@@ -18,6 +18,7 @@ from evalscope.constants import (
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
 
@@ -124,6 +125,19 @@ class TaskConfig(BaseArgument):
     analysis_report: bool = False
     """Whether to generate detailed analysis reports after evaluation."""
 
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
     def __post_init__(self):
         self.__init_model_and_id()
 
@@ -132,6 +146,7 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()
 
     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
@@ -223,6 +238,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }
 
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -238,7 +261,7 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
         if isinstance(self.model, (Model, ModelAPI)):

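A minimal sketch of switching the new sandbox options on, assuming the usual top-level TaskConfig/run_task entry points; the model name is a placeholder:

from evalscope import TaskConfig, run_task

# Sketch: run a code benchmark with sandboxed execution enabled
# ('my-model' is a placeholder, not a real model id).
task = TaskConfig(
    model='my-model',
    datasets=['live_code_bench'],
    use_sandbox=True,            # execute generated code in a sandbox
    sandbox_type='docker',       # default backend; requires ms_enclave[docker]
    sandbox_manager_config={},   # empty -> local manager; a 'url' key selects a remote manager
)
run_task(task_cfg=task)
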
evalscope/constants.py
CHANGED
@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
 
 
 class HubType:
@@ -130,6 +131,8 @@ class Tags:
     TEXT_TO_IMAGE = 'TextToImage'
     IMAGE_EDITING = 'ImageEditing'
     MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
 
 
 class FileConstants:

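The new IS_BUILD_DOC flag lets modules skip heavy optional imports while documentation is being built (BUILD_DOC=1). A hypothetical guard using it:

from evalscope.constants import IS_BUILD_DOC

# Hypothetical pattern: only pull in a heavy dependency when not building docs.
if not IS_BUILD_DOC:
    import torch  # heavy import only needed at evaluation time
else:
    torch = None  # docs build gets a lightweight stub
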
evalscope/evaluator/evaluator.py
CHANGED
@@ -8,8 +8,9 @@ and report generation.
 """
 
 import os
+import traceback
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
 from tqdm import tqdm
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
+from evalscope.utils.logger import get_logger
 
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
     from evalscope.config import TaskConfig
     from evalscope.utils.io_utils import OutputsStructure
 
-from evalscope.utils.logger import get_logger
-
 logger = get_logger()
 
 
@@ -104,6 +104,9 @@ class DefaultEvaluator(Evaluator):
 
         # Generate the report based on aggregated scores
         report = self.get_report(agg_score_dict)
+
+        # Finalize the evaluation process
+        self.finalize()
         return report
 
     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -186,7 +189,10 @@ class DefaultEvaluator(Evaluator):
             logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -253,7 +259,13 @@ class DefaultEvaluator(Evaluator):
         for future in as_completed(future_to_task_state):
             task_state = future_to_task_state[future]
             try:
-
+                try:
+                    sample_score = future.result()
+                except TimeoutError:
+                    logger.warning(
+                        f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+                    )
+                    sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
                 sample_score_list.append(sample_score)
 
                 # Save the review result to cache for future use
@@ -266,7 +278,10 @@ class DefaultEvaluator(Evaluator):
             logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -319,7 +334,7 @@ class DefaultEvaluator(Evaluator):
 
         # Generate and display a summary table of results
         try:
-            report_table = gen_table(report_list=[report], add_overall_metric=
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
             logger.info(f'\n{self.benchmark_name} report table:'
                         f'\n{report_table} \n')
         except Exception:
@@ -337,3 +352,6 @@ class DefaultEvaluator(Evaluator):
         report.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
         return report
+
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)

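The review loop now turns a per-future timeout into a zero score instead of an error. The underlying concurrent.futures pattern, shown here independently of evalscope's classes:

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def review(sample_id: int) -> float:
    # Stand-in for a potentially slow review/judge call.
    time.sleep(0.1 * sample_id)
    return 1.0

samples = [1, 2, 30]
scores = {}
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(review, s): s for s in samples}
    for future, sample_id in futures.items():
        try:
            # result(timeout=...) raises concurrent.futures.TimeoutError if the
            # worker is not done in time; treat that as a zero score.
            scores[sample_id] = future.result(timeout=1.0)
        except TimeoutError:
            scores[sample_id] = 0.0
print(scores)  # -> {1: 1.0, 2: 1.0, 30: 0.0}
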
evalscope/metrics/metric.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from collections import defaultdict
 from typing import List
 
@@ -6,11 +7,19 @@ from evalscope.api.registry import register_aggregation, register_metric
 from .metrics import mean
 
 
+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 @register_metric(name='exact_match')
 class ExactMatch(Metric):
 
     def apply(self, predictions, references):
-        return [
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
 
 
 @register_metric(name='acc')
@@ -92,6 +101,56 @@ class MultiChoiceAcc(Metric):
         return res
 
 
+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a string of json.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        # Unwrap predictions if it's a nested list
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference which is a json string
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS for each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both the answers - gt and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+                if question_result < self.thresh_hold:
+                    question_result = 0.0
+            res.append(question_result)
+        return res
+
+
 # ##################
 # T2I Metrics ######
 ####################
@@ -202,6 +261,9 @@ class Mean(Aggregator):
 
     name = 'mean'
 
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +292,7 @@ class Mean(Aggregator):
         if values:  # Only process non-empty value lists
             aggregated_scores.append(
                 AggScore(
-                    score=
+                    score=self.agg_func(values),
                     metric_name=metric_name,
                     aggregation_name=self.name,
                     num=len(values),
@@ -241,6 +303,20 @@ class Mean(Aggregator):
         return aggregated_scores
 
 
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):

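To make the new ANLS metric concrete, a small worked example (assuming ANLS can be imported and instantiated directly from evalscope.metrics.metric as defined above):

from evalscope.metrics.metric import ANLS

# References may be JSON lists of acceptable answers; per-sample scores
# below the 0.5 threshold are clipped to 0.
anls = ANLS(thresh_hold=0.5)
print(anls.apply(['Paris', '48 %'], ['["paris"]', '48%']))
# -> [1.0, 0.75]: exact case-insensitive match, then one edit over max length 4 (1 - 0.25).
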
evalscope/metrics/metrics.py
CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
         num_samples_it = iter(num_samples)
 
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+def levenshtein_distance(s1, s2):
+    if len(s1) > len(s2):
+        s1, s2 = s2, s1
+
+    distances = range(len(s1) + 1)
+    for i2, c2 in enumerate(s2):
+        distances_ = [i2 + 1]
+        for i1, c1 in enumerate(s1):
+            if c1 == c2:
+                distances_.append(distances[i1])
+            else:
+                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+        distances = distances_
+    return distances[-1]

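A quick sanity check of the new helper on the classic textbook pair:

from evalscope.metrics.metrics import levenshtein_distance

# 'kitten' -> 'sitting' needs 3 edits (k->s, e->i, insert g).
print(levenshtein_distance('kitten', 'sitting'))  # 3
print(levenshtein_distance('', 'abc'))            # 3 (three insertions)
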
File without changes (×10)

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py
CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Any, Dict, Optional, Tuple

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py
CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Tuple
 

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py
CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Optional, Tuple
 

evalscope/models/model_apis.py
CHANGED
@@ -28,7 +28,7 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
-    check_import('torch', package='torch', raise_error=True)
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
     from .modelscope import ModelScopeAPI
 
@@ -38,7 +38,7 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
-    check_import('torch', package='torch', raise_error=True)
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
     from .modelscope import ModelScopeAPI
 
@@ -47,9 +47,10 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
-    check_import('torch',
-
-
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
 
     from .text2image_model import Text2ImageAPI
 
@@ -58,9 +59,10 @@ def text2image() -> type[ModelAPI]:
 
 @register_model_api(name='image_editing')
 def image_editing() -> type[ModelAPI]:
-    check_import('torch',
-
-
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
 
     from .image_edit_model import ImageEditAPI
 

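Each factory above defers its heavy backend import until the model API is actually requested, after the dependency check has passed. The general shape of that lazy-registration pattern, with simplified stand-in names rather than evalscope's real registry:

import importlib.util

# Stand-in registry; names here are illustrative, not evalscope's API.
_MODEL_APIS = {}

def register_api(name):
    def decorator(factory):
        _MODEL_APIS[name] = factory   # store the factory, not a resolved class
        return factory
    return decorator

@register_api('text2image')
def text2image():
    # Dependency check first; the heavy import only runs if it passes.
    missing = [m for m in ('torch', 'torchvision', 'diffusers')
               if importlib.util.find_spec(m) is None]
    if missing:
        raise ImportError(f'text2image needs {missing}; try `pip install evalscope[aigc]`')
    from diffusers import DiffusionPipeline  # deferred heavy import
    return DiffusionPipeline

def get_api(name):
    return _MODEL_APIS[name]()  # factories resolve lazily, at use time
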
evalscope/models/utils/openai.py
CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]
 
         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )
 
     else:

evalscope/perf/arguments.py
CHANGED
@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')

evalscope/perf/plugin/api/base.py
CHANGED
@@ -43,7 +43,7 @@ class ApiPluginBase:
 
     @abstractmethod
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
             body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         raise NotImplementedError
 

evalscope/perf/plugin/api/default_api.py
CHANGED
@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
         super().__init__(param)
 
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
             body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         try:
             headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in process_request: {e}')
             yield (True, None, str(e))
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle streaming response from server-sent events.
 
         Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in _handle_stream: {e}')
             yield True, response.status, str(e)
 
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
        """Handle the HTTP response based on content type and status.
 
         Args:
             response: The aiohttp response object
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         response_status = response.status
         response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
         # Handle successful response with 'application/json' content type
         elif content_type_json in response_content_type:
             content = await response.json()
-            yield (False, response_status,
+            yield (False, response_status, content)
         # Handle other successful responses
         else:
             content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
         else:
             # error is always in JSON format
             error = await response.json()
-            yield (True, response_status,
+            yield (True, response_status, error)

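The new --image-patch-size option feeds the local estimate of how many vision tokens an image contributes. A common patch-grid approximation is shown below; the exact formula evalscope uses is not quoted here, and this sketch assumes 28-pixel-patch tokenizers in the style of Qwen-VL:

import math

def estimate_image_tokens(width: int, height: int, patch_size: int = 28) -> int:
    # Rough estimate: one token per patch_size x patch_size tile.
    return math.ceil(width / patch_size) * math.ceil(height / patch_size)

# The default 224x224 random VL image with 28-pixel patches:
print(estimate_image_tokens(224, 224))  # 8 * 8 = 64 tokens
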