evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +1 -1
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +2 -1
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/logger.py +49 -17
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0
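The hunks below register a number of new benchmarks (hallusion_bench, math_verse, math_vision, the NER suite, and others) under the `name=` fields of their `BenchmarkMeta` entries. For orientation, a benchmark registered this way is selected by that name through evalscope's task entry point; the snippet below is a minimal sketch that assumes the `TaskConfig`/`run_task` interface of earlier 1.x releases is unchanged, and uses a placeholder model id.

    from evalscope import TaskConfig, run_task

    # Minimal sketch, not taken from this diff: select a newly registered benchmark
    # by the name it registers under (here 'hallusion_bench'); the model id below is
    # a placeholder and the TaskConfig fields are assumed unchanged from earlier 1.x.
    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-VL-7B-Instruct',  # placeholder model id
        datasets=['hallusion_bench'],
        limit=10,  # small smoke-test run
    )
    run_task(task_cfg=task_cfg)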
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -20,11 +20,12 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name=DataCollection.NAME,
+        pretty_name='Data-Collection',
         dataset_id='', # dataset_id need to be set
         description='Custom Data collection, mixing multiple evaluation datasets for '
         'a unified evaluation, aiming to use less data to achieve a more comprehensive '
         'assessment of the model\'s capabilities. '
-        '[Usage Reference](https://evalscope.readthedocs.io/
+        '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
         tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
evalscope/benchmarks/general_arena/general_arena_adapter.py
@@ -31,7 +31,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
         'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
         'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
         'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
-        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html).',
         dataset_id='general_arena',
         metric_list=['winrate'],
         few_shot_num=0,
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -20,7 +20,7 @@ logger = get_logger()
         name='general_mcq',
         pretty_name='General-MCQ',
         description='A general multiple-choice question answering dataset for custom evaluation. '
-        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#mcq).',
         tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
         dataset_id='general_mcq',
         subset_list=['default'],
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -20,7 +20,7 @@ PROMPT_TEMPLATE = '请回答问题\n{question}'
         name='general_qa',
         pretty_name='General-QA',
         description='A general question answering dataset for custom evaluation. '
-        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/
+        'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa).', # noqa: E501
         tags=[Tags.QA, Tags.CUSTOM],
         dataset_id='general_qa',
         metric_list=['BLEU', 'Rouge'],
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import re
 from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -12,13 +13,26 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 PROMPT_TEMPLATE = """
-Solve the following math problem step by step. The last line of your response should
+Solve the following math problem step by step. The last line of your response should display the answer enclosed within \\boxed{{\\text{{$ANSWER}}}}.
 
-
+Example:
+
+Let's solve the problem step by step.
+
+Problem: Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?
+
+Step 1: Calculate Eliza's earnings for the first 40 hours. Eliza's hourly rate is $10, so her earnings for the first 40 hours are $10/hour x 40 hours = $400.
+Step 2: Calculate Eliza's overtime pay rate. Eliza's overtime pay rate is 1.2 times her regular hourly rate, so her overtime pay rate is $10/hour x 1.2 = $12/hour.
+Step 3: Calculate Eliza's earnings for the overtime hours. Eliza worked for 45 hours, so her overtime hours are 45 hours - 40 hours = 5 hours. Her earnings for the overtime hours are $12/hour x 5 hours = $60.
+Step 4: Calculate Eliza's total earnings for the week. Eliza's total earnings for the week are her earnings for the first 40 hours plus her earnings for the overtime hours, which is $400 + $60 = $460.
 
-
+Answer:
+\\boxed{{\\text{{460}}}}
 
-
+question:
+{question}
+
+Remember to put your answer on its own line at the end in the form "\\boxed{{\\text{{$ANSWER}}}}" (without quotes), where $ANSWER is replaced by the actual answer to the problem.
 """.lstrip() # noqa: E501
 
 FEWSHOT_TEMPLATE = """
@@ -69,6 +83,11 @@ class GSM8KAdapter(DefaultDataAdapter):
         return ''
 
     def extract_answer(self, prediction: str, task_state: TaskState):
+        boxed_match = re.search(r'\\boxed\\{\\text\\{([^}]*)\\}\\}', prediction)
+        if boxed_match:
+            result = boxed_match.group(1).strip()
+            return result.strip()
+
         from evalscope.filters.extraction import RegexFilter
 
         regex = RegexFilter(regex_pattern=r'(-?[0-9.,]{2,})|(-?[0-9]+)', group_select=-1)
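The new GSM8K prompt instructs the model to end with `\boxed{\text{$ANSWER}}`, and `extract_answer` now tries a boxed match before falling back to the existing `RegexFilter` numeric pattern. Below is a standalone illustration of that extraction order; the regex is a simplified stand-in, not necessarily the exact pattern shipped in `gsm8k_adapter.py`.

    import re

    # Simplified stand-in for the boxed-answer extraction added above; the exact
    # pattern shipped in gsm8k_adapter.py may differ. Falls back to the last bare
    # number when no \boxed{\text{...}} wrapper is present.
    BOXED = re.compile(r'\\boxed\{\\text\{([^}]*)\}\}')
    NUMBER = re.compile(r'(-?[0-9.,]{2,})|(-?[0-9]+)')

    def extract(prediction: str) -> str:
        boxed = BOXED.search(prediction)
        if boxed:
            return boxed.group(1).strip()
        matches = NUMBER.findall(prediction)  # fallback: take the last number mentioned
        return ''.join(matches[-1]) if matches else ''

    print(extract('Her earnings are $400 + $60 = $460.\n\nAnswer:\n\\boxed{\\text{460}}'))  # -> 460
    print(extract('The total is 460 dollars.'))  # -> 460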
evalscope/benchmarks/hallusion_bench/__init__.py: File without changes

evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py
@@ -0,0 +1,158 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='hallusion_bench',
+        pretty_name='HallusionBench',
+        tags=[Tags.MULTI_MODAL, Tags.HALLUCINATION, Tags.YES_NO],
+        description=
+        'HallusionBench is an advanced diagnostic benchmark designed to evaluate image-context reasoning, analyze models\' tendencies for language hallucination and visual illusion in large vision-language models (LVLMs).', # noqa: E501
+        dataset_id='lmms-lab/HallusionBench',
+        metric_list=['aAcc', 'qAcc', 'fAcc'],
+        eval_split='image',
+        prompt_template='{question}\nPlease answer YES or NO without an explanation.',
+    )
+)
+class HallusionBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = self.prompt_template.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        answer = 'NO' if str(record.get('answer', '0')) == '1' else 'YES'
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=answer,
+            metadata={
+                'category': record.get('category'),
+                'subcategory': record.get('subcategory'),
+                'visual_input': record.get('visual_input'),
+                'set_id': record.get('set_id'),
+                'figure_id': record.get('figure_id'),
+                'question_id': record.get('question_id'),
+            }
+        )
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        # Check if the reference answer is in the filtered prediction
+        result = 1 if reference in filtered_prediction.strip().upper() else 0
+        score.value = {'acc': result}
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+
+        def compute_aAcc(scores: List[SampleScore]):
+            total = len(scores)
+            if total == 0:
+                return 0.0, 0
+            correct = sum(ss.score.main_value for ss in scores)
+            return (correct / total), total
+
+        def compute_group_accuracy(scores: List[SampleScore], group_type: str):
+            # group_type: 'figure' or 'question'
+            groups = defaultdict(list)
+            for ss in scores:
+                md = ss.sample_metadata
+                subcategory = md.get('subcategory')
+                set_id = md.get('set_id')
+                group_id = md.get('figure_id') if group_type == 'figure' else md.get('question_id')
+                if subcategory is None or set_id is None or group_id is None:
+                    # Skip incomplete records for this grouping
+                    continue
+                key = f'{subcategory}_{set_id}_{group_id}'
+                groups[key].append(ss.score.main_value)
+            if not groups:
+                return 0.0, 0
+            num_correct_groups = sum(1 for vals in groups.values() if all(vals))
+            num_groups = len(groups)
+            return (num_correct_groups / num_groups), num_groups
+
+        def compute_metrics(scores: List[SampleScore]) -> Dict[str, Dict[str, float]]:
+            a_acc, a_n = compute_aAcc(scores)
+            f_acc, f_n = compute_group_accuracy(scores, 'figure')
+            q_acc, q_n = compute_group_accuracy(scores, 'question')
+            return {
+                'aAcc': {
+                    'score': a_acc,
+                    'num': a_n
+                },
+                'fAcc': {
+                    'score': f_acc,
+                    'num': f_n
+                },
+                'qAcc': {
+                    'score': q_acc,
+                    'num': q_n
+                },
+            }
+
+        outputs: List[AggScore] = []
+
+        # By subcategory
+        subcategories = sorted({ss.sample_metadata.get('subcategory') for ss in sample_scores})
+        for subcategory in subcategories:
+            subset = [ss for ss in sample_scores if ss.sample_metadata.get('subcategory') == subcategory]
+            stats = compute_metrics(subset)
+            for metric in ['aAcc', 'fAcc', 'qAcc']:
+                outputs.append(
+                    AggScore(
+                        score=stats[metric]['score'],
+                        metric_name=metric,
+                        aggregation_name=str(subcategory),
+                        num=stats[metric]['num'],
+                    )
+                )
+
+        # By category
+        categories = sorted({ss.sample_metadata.get('category') for ss in sample_scores})
+        for category in categories:
+            subset = [ss for ss in sample_scores if ss.sample_metadata.get('category') == category]
+            stats = compute_metrics(subset)
+            for metric in ['aAcc', 'fAcc', 'qAcc']:
+                outputs.append(
+                    AggScore(
+                        score=stats[metric]['score'],
+                        metric_name=metric,
+                        aggregation_name=str(category),
+                        num=stats[metric]['num'],
+                    )
+                )
+
+        # Overall
+        overall = compute_metrics(sample_scores)
+        for metric in ['aAcc', 'fAcc', 'qAcc']:
+            outputs.append(
+                AggScore(
+                    score=overall[metric]['score'],
+                    metric_name=metric,
+                    aggregation_name='Overall',
+                    num=overall[metric]['num'],
+                )
+            )
+
+        return outputs
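Of the three HallusionBench metrics above, aAcc is plain per-sample accuracy, while fAcc and qAcc count a figure group or question group as correct only when every sample sharing the same subcategory, set_id, and group id is answered correctly. The snippet below is a standalone restatement of that grouping rule using plain tuples in place of `SampleScore` objects; it is illustrative only.

    from collections import defaultdict

    # Standalone restatement of the fAcc/qAcc rule in the adapter above: a group
    # counts as correct only if every sample in it is correct. Records are
    # (subcategory, set_id, figure_id, question_id, correct) tuples.
    records = [
        ('illusion', 0, 1, 1, 1),
        ('illusion', 0, 1, 2, 0),  # same figure as the first record, different question
        ('illusion', 0, 2, 1, 1),
    ]

    def grouped_accuracy(records, group_type):
        groups = defaultdict(list)
        for sub, set_id, fig, q, correct in records:
            key = (sub, set_id, fig if group_type == 'figure' else q)
            groups[key].append(correct)
        return sum(all(v) for v in groups.values()) / len(groups)

    print(grouped_accuracy(records, 'figure'))    # fAcc: 1 of 2 figure groups fully correct -> 0.5
    print(grouped_accuracy(records, 'question'))  # qAcc: 1 of 2 question groups fully correct -> 0.5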
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -21,7 +21,8 @@ logger = get_logger()
         pretty_name='HumanEval',
         tags=[Tags.CODING],
         description=
-        'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.'
+        'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior. '
+        '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**', # noqa: E501
         dataset_id='opencompass/humaneval',
         subset_list=['openai_humaneval'],
         metric_list=['Pass@1'],
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
 from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -19,7 +20,8 @@ logger = get_logger()
         pretty_name='Live-Code-Bench',
         tags=[Tags.CODING],
         description=
-        'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.'
+        'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions. '
+        '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**',
         dataset_id='AI-ModelScope/code_generation_lite',
         subset_list=['release_latest'],
         metric_list=['Pass@1'],
evalscope/benchmarks/math_verse/__init__.py: File without changes

evalscope/benchmarks/math_verse/math_verse_adapter.py
@@ -0,0 +1,100 @@
+# flake8: noqa: E501
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+MULTI_CHOICE_TYPE = 'multi-choice'
+OPEN_TYPE = 'free-form'
+
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.'
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = ['Text Dominant', 'Text Lite', 'Vision Intensive', 'Vision Dominant', 'Vision Only']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_verse',
+        pretty_name='MathVerse',
+        dataset_id='evalscope/MathVerse',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'MathVerse, an all-around visual math benchmark designed for an equitable and in-depth evaluation of MLLMs. 2,612 high-quality, multi-subject math problems with diagrams from publicly available sources. Each problem is then transformed by human annotators into six distinct versions, each offering varying degrees of information content in multi-modality, contributing to 15K test samples in total. This approach allows MathVerse to comprehensively assess whether and how much MLLMs can truly understand the visual diagrams for mathematical reasoning.',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        default_subset='testmini',
+        eval_split='testmini',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVerseAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a dataset record to a Sample. Unifies handling for both multi-choice and free-form.
+        Builds the content list inline and appends image content if provided.
+
+        Args:
+            record: Raw dataset record.
+
+        Returns:
+            Sample: The standardized sample ready for evaluation.
+        """
+        question_type = record.get('question_type', OPEN_TYPE)
+        question: str = record.get('question', '')
+        content_list: list[Content] = []
+
+        # Choose prompt text based on type; keep a single unified flow for creating Sample
+        if question_type == MULTI_CHOICE_TYPE:
+            prompt_text = MULT_CHOICE_PROMPT.format(question=question).strip()
+        else:
+            prompt_text = OPEN_PROMPT.format(question=question).strip()
+
+        content_list.append(ContentText(text=prompt_text))
+
+        # Append image if exists
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata: Dict[str, Any] = {
+            'sample_index': record.get('sample_index'),
+            'problem_index': record.get('problem_index'),
+            'problem_version': record.get('problem_version'),
+            'question_type': question_type,
+            'query_wo': record.get('query_wo'),
+            'query_cot': record.get('query_cot'),
+            'question_for_eval': record.get('question_for_eval'),
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['problem_version'],
+            metadata=metadata,
+        )
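Several of the new vision adapters in this release (HallusionBench and MathVerse above, MathVision below) attach images by calling `bytes_to_base64(..., add_header=True)` from the new `evalscope/utils/io_utils.py` (+74 lines), which is not shown in this diff. The sketch below only illustrates the kind of base64 data-URL string such a call plausibly produces; the helper name and signature here are hypothetical stand-ins.

    import base64

    # Hypothetical stand-in for bytes_to_base64(..., add_header=True); the real helper
    # is added in evalscope/utils/io_utils.py in this release and may differ in name,
    # signature and defaults.
    def bytes_to_data_url(data: bytes, format: str = 'png') -> str:
        encoded = base64.b64encode(data).decode('utf-8')
        return f'data:image/{format};base64,{encoded}'

    print(bytes_to_data_url(b'\x89PNG\r\n'))  # -> data:image/png;base64,iVBORw0K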
evalscope/benchmarks/math_vision/__init__.py: File without changes

evalscope/benchmarks/math_vision/math_vision_adapter.py
@@ -0,0 +1,111 @@
+# flake8: noqa: E501
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+SUBSET_LIST = ['level 1', 'level 2', 'level 3', 'level 4', 'level 5']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_vision',
+        pretty_name='MathVision',
+        dataset_id='evalscope/MathVision',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'The MATH-Vision (MATH-V) dataset, a meticulously curated collection of 3,040 high-quality mathematical problems with visual contexts sourced from real math competitions.',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class MathVisionAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        if len(record['options']) > 0:
+            question_type = 'multi_choice'
+        else:
+            question_type = 'free_form'
+        content_list, answers_list = MathVisionAdapter.create_content_and_answers_list(record, question_type)
+        metadata = {
+            'id': record['id'],
+            'image': record['image'],
+            'solution': record['solution'],
+            'level': record['level'],
+            'question_type': question_type,
+            'subject': record['subject']
+        }
+        if question_type == 'multi_choice':
+            label_answer = record['answer']
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                choices=answers_list,
+                target=label_answer,
+                subset_key=f'level {record["level"]}',
+                metadata=metadata
+            )
+        elif question_type == 'free_form':
+            return Sample(
+                input=[ChatMessageUser(content=content_list)],
+                target=record['answer'],
+                subset_key=f'level {record["level"]}',
+                metadata=metadata
+            )
+        else:
+            raise ValueError(f'Unexpected question_type: {question_type}')
+
+    @staticmethod
+    def create_content_and_answers_list(record: Dict[str, Any], question_type) -> tuple[List[Content], List[str]]:
+        """
+        Create a list of content elements and a list of answers from a record.
+
+        Args:
+            record (dict): The record containing question, images, and options.
+            question_type (str): The type of this question
+
+
+        Returns:
+            tuple: A tuple containing:
+                - content_list (list): A list of content elements (text and images).
+                - answers_list (list): A list of possible answers (for multiple-choice questions).
+        """
+
+        # Replace <image1>, <image2> ... to [image1], [image2], ... from question text
+        question = re.sub(r'<image(\d+)>', r'[image\1]', record['question']).strip()
+
+        if question_type == 'multi_choice':
+            answers_list = record['options']
+            input_text = prompt(question=question, choices=answers_list, template=MULT_CHOICE_PROMPT)
+            content_list: List[Content] = [ContentText(text=input_text)]
+        else:
+            answers_list: List[str] = []
+            content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=question))]
+        image = record['decoded_image']
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return content_list, answers_list
evalscope/benchmarks/math_vista/math_vista_adapter.py
@@ -4,7 +4,6 @@ from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
 from evalscope.api.dataset import Sample
-from evalscope.api.evaluator import TaskState
 from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
@@ -14,15 +13,7 @@ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers,
 
 logger = get_logger()
 
-
-
-OPEN_PROMPT = """
-Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
-
-{question}
-
-Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
-"""
+OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
 
 MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
 
@@ -38,8 +29,11 @@ OPEN_TYPE = 'free_form'
         tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
         description=
         'MathVista is a consolidated Mathematical reasoning benchmark within Visual contexts. It consists of three newly created datasets, IQTest, FunctionQA, and PaperQA, which address the missing visual domains and are tailored to evaluate logical reasoning on puzzle test figures, algebraic reasoning over functional plots, and scientific reasoning with academic paper figures, respectively. It also incorporates 9 MathQA datasets and 19 VQA datasets from the literature, which significantly enrich the diversity and complexity of visual perception and mathematical reasoning challenges within our benchmark. In total, MathVista includes 6,141 examples collected from 31 different datasets.',
-
-
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
         eval_split='testmini',
         prompt_template=OPEN_PROMPT,
     )
@@ -86,20 +80,6 @@ class MathVistaAdapter(VisionLanguageAdapter):
             logger.warning(f"Answer '{value}' not found in options: {options}. This may cause evaluation issues.")
         return value
 
-    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
-        question_type = task_state.metadata['question_type']
-        if question_type == MULTI_CHOICE_TYPE:
-            answers = parse_answers(task_state)
-            return ''.join(sorted(list(answers)))
-        elif question_type == OPEN_TYPE:
-            pattern = r'ANSWER:\s*(.*)'
-            match = re.search(pattern, prediction)
-            if match:
-                return match.group(1).strip()
-            return ''
-        else:
-            raise ValueError(f'Unsupported question type: {question_type}')
-
     @staticmethod
     def create_content_and_answers_list(record: dict[str, Any], ) -> tuple[list[Content], list[str]]:
         """
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
         'It requires the model to find specific information within a large corpus of text. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)', # noqa: E501
         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
         metric_list=['acc'],
         subset_list=['english', 'chinese'],
evalscope/benchmarks/ner/__init__.py: File without changes

evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py
@@ -0,0 +1,52 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+    'and social uses. The goal is to represent a broad range of activities, giving a '
+    'dataset more representative of the language used in this hardest of social media '
+    'formats to process.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='broad-twitter-corpus',
+        pretty_name='BroadTwitterCorpus',
+        dataset_id='extraordinarylab/broad-twitter-corpus',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class BroadTwitterCorpusAdapter(NERAdapter):
+    """
+    Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the BroadTwitterCorpus dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define BroadTwitterCorpus-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
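The NER adapters added in this release declare their label sets through `entity_type_map` and `entity_descriptions` and report precision, recall, f1_score and accuracy; the shared prompting templates and scoring logic live in the new `evalscope/utils/ner.py` (+377 lines), which is not shown in this diff. For orientation, entity-level precision, recall and F1 over (entity text, entity type) pairs are typically computed along the lines of the generic sketch below; this is not the shipped implementation.

    # Generic sketch of entity-level scoring for an NER benchmark like the one above,
    # over (entity text, entity type) pairs; this is not the implementation shipped
    # in evalscope/utils/ner.py.
    def prf1(predicted: set, gold: set):
        tp = len(predicted & gold)
        precision = tp / len(predicted) if predicted else 0.0
        recall = tp / len(gold) if gold else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        return precision, recall, f1

    gold = {('Ferguson', 'person'), ('Manchester United', 'organization')}
    pred = {('Ferguson', 'person'), ('Manchester', 'organization')}
    print(prf1(pred, gold))  # -> (0.5, 0.5, 0.5)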