evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py

@@ -0,0 +1,64 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+OPEN_PROMPT = (
+    'Read the picture and solve the following problem step by step.'
+    'The last line of your response should be of the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+    '{question}\n\n'
+    'Remember to put your answer on its own line at the end in the form'
+    ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+    ' and you do not need to use a \\boxed command.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='real_world_qa',
+        pretty_name='RealWorldQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.',  # noqa: E501
+        dataset_id='lmms-lab/RealWorldQA',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class RealWorldQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            metadata={'image_path': record['image_path']}
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
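As a quick illustration of the `ANSWER:` extraction rule used by `extract_answer` above (a minimal sketch; only the regex comes from the adapter, the sample prediction text is made up):

```python
import re

# Hypothetical model output for illustration.
prediction = 'The crosswalk sign is lit, so pedestrians may cross.\nANSWER: yes'

match = re.search(r'ANSWER:\s*(.*)', prediction)
answer = match.group(1).strip() if match else ''
print(answer)  # -> 'yes'
```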
evalscope/benchmarks/tau_bench/generation.py

@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
         input=[dict_to_chat_message(msg) for msg in messages],
         tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
     )
-    oai_res = openai_chat_choices(res.choices)
+    oai_res = openai_chat_choices(res.choices, include_reasoning=False)

     next_message = oai_res[0].message.model_dump(exclude_none=True)

evalscope/benchmarks/tau_bench/tau_bench_adapter.py

@@ -13,6 +13,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import

 logger = get_logger()

@@ -35,8 +36,8 @@ logger = get_logger()
         'api_key': 'EMPTY',
         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
         'generation_config': {
-            'temperature': 0.
-            '
+            'temperature': 0.0,
+            'max_tokens': 4096,
         }
     }
 )
@@ -46,22 +47,18 @@ class TauBenchAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-
-
-
-
-
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )

         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})

         self._patch_env_completion()

@@ -84,10 +81,10 @@ class TauBenchAdapter(DefaultDataAdapter):

         res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

-        message = res.
+        message = {'role': 'assistant', 'content': res.completion}
         self.messages.append(message)
         self.total_cost = 0
-        return
+        return res.completion

         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +111,11 @@
         })
         # load dataset
         dataset = DictDataLoader(
-            dict_list=tasks,
+            dict_list=tasks,
+            sample_fields=self.record_to_sample,
+            limit=self.limit,
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()

         data_dict[env_name] = dataset
@@ -145,15 +146,15 @@

         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)

             score.value = {
                 'Pass^1': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
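For reference, a sketch of the `extra_params` keys that `TauBenchAdapter.__init__` reads above. The key names and defaults are taken from the hunk; routing them through a `dataset_args`/`extra_params` mapping is an assumption about how evalscope wires adapter parameters:

```python
# Sketch only: mirrors the self.extra_params.get(...) defaults shown above.
tau_bench_extra_params = {
    'user_model': 'qwen-plus',
    'api_key': 'EMPTY',
    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
    'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
}

# Assumed configuration layout; adjust to the actual evalscope task config.
dataset_args = {'tau_bench': {'extra_params': tau_bench_extra_params}}
```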
File without changes
evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py

@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py

@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py

@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py

@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
             }
         )
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py

@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):

         super().__init__(**kwargs)

-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
evalscope/cli/start_app.py
CHANGED

@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )

         create_app(self.args)
evalscope/cli/start_perf.py
CHANGED

@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )

         run_perf_benchmark(self.args)
evalscope/config.py
CHANGED

@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,10 +15,10 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger

@@ -28,51 +28,115 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""

     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""

     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""

     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""

     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""

     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""

     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""

     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""

     def __post_init__(self):
         self.__init_model_and_id()
@@ -82,20 +146,22 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()

     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE

         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'

@@ -113,6 +179,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -167,6 +238,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }

+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -182,9 +261,12 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')

     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config

+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
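A minimal sketch of how the new sandbox-related fields on `TaskConfig` might be set (the field names come from the hunks above; the model and dataset values are placeholders):

```python
from evalscope.config import TaskConfig

# Placeholder model/dataset choices; the sandbox fields are the ones added in 1.0.2.
task_cfg = TaskConfig(
    model='qwen-plus',                # may now also be a Model or ModelAPI instance
    datasets=['live_code_bench'],     # hypothetical benchmark choice
    use_sandbox=True,                 # __post_init__ then checks that ms_enclave is importable
    sandbox_type='docker',            # default when left unset and use_sandbox is True
    sandbox_manager_config={},        # empty -> local manager; a 'url' key selects a remote manager
)
```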
evalscope/constants.py
CHANGED

@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc


 class HubType:
@@ -70,6 +71,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service


 class OutputType:
@@ -127,3 +129,12 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
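A small illustration of the constants introduced above (the usage context is assumed):

```python
from evalscope.constants import FileConstants, Tags

# New tags added in 1.0.2 alongside the existing ones.
benchmark_tags = [Tags.MULTI_MODAL, Tags.IMAGE_EDITING]

# FileConstants centralizes metadata keys such as the image path and sample id.
metadata = {FileConstants.IMAGE_PATH: '/tmp/example.png', FileConstants.ID: 'sample-0'}
```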
evalscope/evaluator/evaluator.py
CHANGED

@@ -8,8 +8,9 @@ and report generation.
 """

 import os
+import traceback
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
 from tqdm import tqdm
 from typing import TYPE_CHECKING, Dict, List, Tuple, Union

@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
 from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
 from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
+from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
     from evalscope.config import TaskConfig
     from evalscope.utils.io_utils import OutputsStructure

-from evalscope.utils.logger import get_logger
-
 logger = get_logger()


@@ -96,12 +96,17 @@ class DefaultEvaluator(Evaluator):

         # Process each subset (e.g., test, validation) independently
         for subset, dataset in dataset_dict.items():
-
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
             subset_score = self.evaluate_subset(subset, dataset)
             agg_score_dict[subset] = subset_score

         # Generate the report based on aggregated scores
         report = self.get_report(agg_score_dict)
+
+        # Finalize the evaluation process
+        self.finalize()
         return report

     def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -181,10 +186,13 @@ class DefaultEvaluator(Evaluator):
             model_result = self.cache_manager.save_prediction_cache(
                 subset, task_state, self.benchmark.save_metadata
             )
-            logger.debug(f'Model result: \n{model_result.
+            logger.debug(f'Model result: \n{model_result.pretty_print()}')

         except Exception as exc:
-
+            tb_str = traceback.format_exc()
+            logger.error(
+                f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+            )
             if self.task_config.ignore_errors:
                 logger.warning('Error ignored, continuing with next sample.')
             else:
@@ -251,7 +259,13 @@ class DefaultEvaluator(Evaluator):
         for future in as_completed(future_to_task_state):
             task_state = future_to_task_state[future]
             try:
-
+                try:
+                    sample_score = future.result()
+                except TimeoutError:
+                    logger.warning(
+                        f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+                    )
+                    sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
                 sample_score_list.append(sample_score)

                 # Save the review result to cache for future use
@@ -261,10 +275,13 @@ class DefaultEvaluator(Evaluator):
                     sample_score=sample_score,
                     save_metadata=self.benchmark.save_metadata
                 )
-                logger.debug(f'Review result: \n{review_result.
+                logger.debug(f'Review result: \n{review_result.pretty_print()}')

             except Exception as exc:
-
+                tb_str = traceback.format_exc()
+                logger.error(
+                    f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+                )
                 if self.task_config.ignore_errors:
                     logger.warning('Error ignored, continuing with next sample.')
                 else:
@@ -317,7 +334,7 @@ class DefaultEvaluator(Evaluator):

         # Generate and display a summary table of results
         try:
-            report_table = gen_table(report_list=[report], add_overall_metric=
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
             logger.info(f'\n{self.benchmark_name} report table:'
                         f'\n{report_table} \n')
         except Exception:
@@ -335,3 +352,6 @@ class DefaultEvaluator(Evaluator):
         report.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
         return report
+
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)
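The review loop above wraps `future.result()` so that a `TimeoutError` degrades to a zero score while any other exception is logged with its traceback. A stripped-down sketch of the same pattern (names are illustrative, not evalscope APIs):

```python
import traceback
from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed

def review_all(task_states, review_fn, max_workers=4):
    """Collect one result per task state; a TimeoutError from review_fn degrades
    to a placeholder result, anything else is logged with a full traceback."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(review_fn, ts): ts for ts in task_states}
        for future in as_completed(futures):
            ts = futures[future]
            try:
                try:
                    # future.result() re-raises whatever review_fn raised,
                    # including a TimeoutError from an internal deadline.
                    result = future.result()
                except TimeoutError:
                    result = None  # stands in for the zero-score fallback
                results.append(result)
            except Exception:
                # Mirrors the logger.error(...) + traceback.format_exc() above.
                print(f'Review failed for {ts}:\n{traceback.format_exc()}')
    return results
```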