evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py
ADDED
@@ -0,0 +1,75 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import parse_answers
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = [
+    'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+    'Attribute Reasoning'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='visulogic',
+        pretty_name='VisuLogic',
+        dataset_id='evalscope/VisuLogic',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class VisuLogicAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record.get('question', '')
+        content_list: List[Content] = []
+        prompt_text = self.prompt_template.format(question=question).strip()
+        content_list.append(ContentText(text=prompt_text))
+
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'id': record['id'],
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['label'],
+            choices=['A', 'B', 'C', 'D'],
+            subset_key=record['tag'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
evalscope/benchmarks/zerobench/zerobench_adapter.py
ADDED
@@ -0,0 +1,64 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Define the prompt template
+PROMPT_TEMPLATE = """{question}
+\n\n\nLet's think step by step and give the final answer in curly braces,
+like this: {{final answer}}"
+"""
+
+SUBSET_LIST = ['default']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='zerobench',
+        pretty_name='ZeroBench',
+        dataset_id='evalscope/zerobench',
+        tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+        description=
+        'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='zerobench',
+        train_split='zerobench_subquestions',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class ZeroBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question_text']
+        content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+        image = record['question_images_decoded']
+        if len(image) > 0:
+            for img in image:
+                # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'question_id': record['question_id'],
+            'question_images': record['question_images'],
+            'image_attribution': record['image_attribution']
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+        )
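The adapter above relies on compress_image_to_limit from evalscope/utils/io_utils.py, whose body is not shown in this diff. As a rough, non-authoritative sketch only, a helper with the same call shape (raw bytes and a byte budget in, (bytes, format) out) could be written roughly like this with Pillow; the quality ladder and JPEG fallback are assumptions, not evalscope's actual implementation.

# Illustrative sketch only; signature inferred from the call site above.
import io
from PIL import Image


def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000):
    """Return (bytes, format) no larger than max_bytes, re-encoding as JPEG at decreasing quality if needed."""
    if len(image_bytes) <= max_bytes:
        fmt = (Image.open(io.BytesIO(image_bytes)).format or 'png').lower()
        return image_bytes, fmt

    img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
    for quality in (95, 85, 75, 60, 45, 30):
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=quality)
        if buf.tell() <= max_bytes:
            return buf.getvalue(), 'jpeg'
    # Best effort: return the smallest attempt even if it is still above the limit.
    return buf.getvalue(), 'jpeg'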
evalscope/constants.py
CHANGED
@@ -16,6 +16,7 @@ DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
 IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds


 class HubType:
@@ -121,6 +122,7 @@ class Tags:
     CHINESE = 'Chinese'
     COMMONSENSE = 'Commonsense'
     QA = 'QA'
+    NER = 'NER'
     READING_COMPREHENSION = 'ReadingComprehension'
     CUSTOM = 'Custom'
     INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -133,6 +135,8 @@ class Tags:
     MULTI_MODAL = 'MultiModal'
     MULTI_LINGUAL = 'MultiLingual'
     MULTI_TURN = 'MultiTurn'
+    YES_NO = 'Yes/No'
+    HALLUCINATION = 'Hallucination'


 class FileConstants:
evalscope/evaluator/evaluator.py
CHANGED
@@ -10,14 +10,14 @@ and report generation.
 import os
 import traceback
 from collections import defaultdict
-from
-from tqdm import tqdm
-from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List

 from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
+from evalscope.constants import HEARTBEAT_INTERVAL_SEC
 from evalscope.report import Report, gen_table
+from evalscope.utils.function_utils import run_in_threads_with_progress
 from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
@@ -91,22 +91,27 @@ class DefaultEvaluator(Evaluator):
             Report: The complete evaluation report containing all metrics and results.
         """
         # Load the dataset and evaluate each subset
+        logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
         dataset_dict = self.benchmark.load_dataset()
         agg_score_dict = defaultdict(list)

         # Process each subset (e.g., test, validation) independently
+        logger.info('Evaluating all subsets of the dataset...')
         for subset, dataset in dataset_dict.items():
             if len(dataset) == 0:
                 logger.info(f'No samples found in subset: {subset}, skipping.')
                 continue
+            logger.info(f'Evaluating subset: {subset}')
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score

         # Generate the report based on aggregated scores
+        logger.info('Generating report...')
         report = self.get_report(agg_score_dict)

         # Finalize the evaluation process
         self.finalize()
+        logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
         return report

     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -126,12 +131,15 @@
             List[AggScore]: Aggregated scores for this subset.
         """
         # Get model predictions for all samples in the subset
+        logger.info(f'Getting predictions for subset: {subset}')
         task_states = self.get_answers(subset, dataset)

         # Calculate evaluation metrics for each prediction
+        logger.info(f'Getting reviews for subset: {subset}')
         sample_scores = self.get_reviews(subset, task_states)

         # Aggregate individual sample scores into subset-level metrics
+        logger.info(f'Aggregating scores for subset: {subset}')
         agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
         return agg_scores
@@ -162,44 +170,38 @@

         # Convert dataset to list for parallel processing
         dataset_list = list(dataset)
-
         if not dataset_list:
             return task_state_list

-        ... (removed lines not rendered in the source diff)
-                    if self.task_config.ignore_errors:
-                        logger.warning('Error ignored, continuing with next sample.')
-                    else:
-                        raise exc
-                finally:
-                    pbar.update(1)
+        logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
+
+        def worker(sample: Sample) -> TaskState:
+            return self._predict_sample(sample, model_prediction_dir)
+
+        def on_result(sample: Sample, task_state: TaskState) -> None:
+            model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
+            logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+        def on_error(sample: Sample, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        new_task_states = run_in_threads_with_progress(
+            dataset_list,
+            worker,
+            desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.eval_batch_size,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_result=on_result,
+            on_error=on_error,
+        )
+        task_state_list.extend(new_task_states)

+        logger.info(f'Finished getting predictions for subset: {subset}.')
         return task_state_list

     def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
@@ -246,49 +248,40 @@
         if not task_states:
             return sample_score_list

-        ... (removed lines not rendered in the source diff)
-                    tb_str = traceback.format_exc()
-                    logger.error(
-                        f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
-                    )
-                    if self.task_config.ignore_errors:
-                        logger.warning('Error ignored, continuing with next sample.')
-                    else:
-                        raise exc
-                finally:
-                    pbar.update(1)
+        logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')
+
+        def worker(task_state: TaskState) -> SampleScore:
+            return self._review_task_state(task_state)
+
+        def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+            review_result = self.cache_manager.save_review_cache(
+                subset=subset,
+                task_state=task_state,
+                sample_score=sample_score,
+                save_metadata=self.benchmark.save_metadata
+            )
+            logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+        def on_error(task_state: TaskState, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        new_scores = run_in_threads_with_progress(
+            task_states,
+            worker,
+            desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.judge_worker_num,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_result=on_result,
+            on_error=on_error,
+        )
+        sample_score_list.extend(new_scores)

+        logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(sample_score_list)}')
         return sample_score_list

     def _review_task_state(self, task_state: TaskState) -> SampleScore:
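Both loops above now delegate to run_in_threads_with_progress with worker/on_result/on_error callbacks instead of an inline ThreadPoolExecutor/tqdm loop. The real helper is defined in evalscope/utils/function_utils.py and is not shown in this diff; the following is only a minimal sketch of how such a helper could look, with the signature inferred from the call sites above (heartbeat logging is omitted, and results are collected in completion order, which the real helper may not do).

# Sketch only, not evalscope's implementation.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List, Optional, TypeVar

from tqdm import tqdm

T = TypeVar('T')
R = TypeVar('R')


def run_in_threads_with_progress(
    items: List[T],
    worker: Callable[[T], R],
    desc: str = '',
    max_workers: int = 4,
    heartbeat_sec: Optional[int] = None,  # the real helper presumably logs a periodic heartbeat; omitted here
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
) -> List[R]:
    """Run `worker` over `items` in a thread pool, reporting progress and routing callbacks."""
    results: List[R] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor, tqdm(total=len(items), desc=desc) as pbar:
        future_to_item = {executor.submit(worker, item): item for item in items}
        for future in as_completed(future_to_item):
            item = future_to_item[future]
            try:
                result = future.result()
                if on_result is not None:
                    on_result(item, result)
                results.append(result)
            except Exception as exc:  # on_error decides whether to swallow or re-raise
                if on_error is not None:
                    on_error(item, exc)
                else:
                    raise
            finally:
                pbar.update(1)
    return results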
evalscope/metrics/math_parser.py
CHANGED
@@ -211,6 +211,11 @@ def strip_answer_string(string):
     # Remove grade level (e.g., 12th grade) and just maintain the integer
     string = re.sub(r'thgrade$', '', string)

+    # Normalize thousands-formatted numbers (e.g., 70,000 or -1,234,567.89) by removing commas
+    # This must run before the "list of integers" sorting to avoid misclassifying numbers with thousand separators.
+    if re.fullmatch(r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*', string):
+        string = string.replace(',', '')
+
     # If the answer is a list of integers (without parenthesis), sort them
     if re.fullmatch(r'(\s*-?\d+\s*,)*\s*-?\d+\s*', string):
         # Split the string into a list of integers
@@ -262,6 +267,8 @@ def extract_answer(pred_str, use_last_number=True):
     elif '答案是' in pred_str:
         # Handle Chinese few-shot multiple choice problem answer extraction
         pred = pred_str.split('答案是')[1].strip().split('\n\n')[0].strip()
+    elif 'ANSWER:' in pred_str:
+        pred = pred_str.split('ANSWER:')[-1].strip()
     else:  # use the last number
         if use_last_number:
             pattern = '-?\d*\.?\d+'
@@ -529,3 +536,10 @@ def symbolic_equal(a, b):
             pass

     return False
+
+
+if __name__ == '__main__':
+    print(math_equal('\n\\boxed{70,\\!000}\n', '70000'))
+    print(extract_answer('The answer is \\boxed{70,\\!000}'))
+    print(strip_answer_string(extract_answer('The answer is \\boxed{70,\\!000}')))
+    print(math_equal(extract_answer('The answer is \\boxed{70,\\!000}'), '70000'))
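The new comma-normalization step only fires on strings that look like a single thousands-formatted number, so comma-separated lists of integers are still left to the sorting branch right below it. A quick standalone check of the regex behaviour (plain Python, independent of evalscope):

import re

pattern = r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*'

for s in ['70,000', '-1,234,567.89', '1,2,3', '70000']:
    if re.fullmatch(pattern, s):
        print(s, '->', s.replace(',', ''))   # single number: strip thousands separators
    else:
        print(s, '-> unchanged')             # integer list or plain number: handled elsewhere
# 70,000 -> 70000
# -1,234,567.89 -> -1234567.89
# 1,2,3 -> unchanged
# 70000 -> unchanged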
evalscope/metrics/metric.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from collections import defaultdict
 from typing import List

@@ -42,7 +43,7 @@ class Accuracy(ExactMatch):

         results = []
         for prediction, reference in zip(predictions, references):
-            pred_answer =
+            pred_answer = extract_answer(prediction)
             ref_answer = strip_answer_string(reference)
             results.append(float(math_equal(pred_answer, ref_answer)))

@@ -100,6 +101,56 @@ class MultiChoiceAcc(Metric):
         return res


+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a string of json.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        # Unwrap predictions if it's a nested list
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference which is a json string
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS for each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both the answers - gt and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+                if question_result < self.thresh_hold:
+                    question_result = 0.0
+            res.append(question_result)
+        return res
+
+
 # ##################
 # T2I Metrics ######
 ####################
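ANLS scores each prediction as 1 minus the normalized Levenshtein distance to the closest reference answer, then zeroes out anything below the threshold (0.5 by default). A small self-contained illustration of that formula, using its own edit-distance helper and normalized string lengths rather than evalscope's exact code:

def edit_distance(a: str, b: str) -> int:
    # Classic dynamic-programming Levenshtein distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]


def anls(prediction: str, references: list, threshold: float = 0.5) -> float:
    values = []
    for ref in references:
        gt = ' '.join(ref.strip().lower().split())
        det = ' '.join(prediction.strip().lower().split())
        length = max(len(gt), len(det))
        values.append(0.0 if length == 0 else edit_distance(gt, det) / length)
    score = 1 - min(values) if values else 0.0
    return score if score >= threshold else 0.0


print(anls('the eiffel tower', ['Eiffel Tower', 'Tour Eiffel']))  # close match -> high score
print(anls('paris', ['Eiffel Tower']))                            # far off -> 0.0 after thresholding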
evalscope/metrics/metrics.py
CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
     num_samples_it = iter(num_samples)

     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+def levenshtein_distance(s1, s2):
+    if len(s1) > len(s2):
+        s1, s2 = s2, s1
+
+    distances = range(len(s1) + 1)
+    for i2, c2 in enumerate(s2):
+        distances_ = [i2 + 1]
+        for i1, c1 in enumerate(s1):
+            if c1 == c2:
+                distances_.append(distances[i1])
+            else:
+                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+        distances = distances_
+    return distances[-1]
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py
CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Any, Dict, Optional, Tuple
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py
CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Tuple

evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py
CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Optional, Tuple
evalscope/models/utils/openai.py
CHANGED
@@ -204,6 +204,10 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     )
     if config.extra_body:
         params['extra_body'] = config.extra_body
+    if config.extra_query:
+        params['extra_query'] = config.extra_query
+    if config.extra_headers:
+        params['extra_headers'] = config.extra_headers

     return params
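extra_query and extra_headers are per-request options of the openai v1 Python client, alongside extra_body, so forwarding them from GenerateConfig lets a task attach things like API-version query parameters or tracing headers to each call. A hedged illustration of how such params could be used; the model name, query parameter, and header values below are invented for the example and are not taken from evalscope.

from openai import OpenAI

client = OpenAI()
params = {
    'model': 'gpt-4o-mini',                                 # assumed model name
    'messages': [{'role': 'user', 'content': 'ping'}],
    'extra_query': {'api-version': '2024-06-01'},           # e.g. for Azure-style endpoints (example value)
    'extra_headers': {'X-Request-Source': 'evalscope'},     # e.g. for request tracing (example value)
}
# The openai v1 client accepts extra_query/extra_headers/extra_body on .create()
# and merges them into the underlying HTTP request.
response = client.chat.completions.create(**params)
print(response.choices[0].message.content)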