evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/metrics/llm_judge.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
        Returns:
            str: The response from the LLM
        """
-
-
-
-
-
-        input_messages
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
             response = self.model.generate(input_messages)
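A hedged sketch of calling the widened judge() signature shown above; the judge instance construction is assumed and not part of this diff.

from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

def run_judge(judge):  # `judge` is an already-configured LLMJudge instance
    # 1.0.0-style call: plain prompt, optional system prompt
    verdict_a = judge.judge(prompt='Is the answer "4" correct for "2 + 2"?')

    # new in 1.0.2: pass a prebuilt message list, bypassing prompt/system_prompt
    msgs = [
        ChatMessageSystem(content='You are a strict grader.'),
        ChatMessageUser(content='Grade the answer "4" for "2 + 2 = ?".'),
    ]
    verdict_b = judge.judge(messages=msgs)
    return verdict_a, verdict_b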
evalscope/metrics/metric.py
CHANGED
@@ -6,11 +6,19 @@ from evalscope.api.registry import register_aggregation, register_metric
 from .metrics import mean
 
 
+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 @register_metric(name='exact_match')
 class ExactMatch(Metric):
 
     def apply(self, predictions, references):
-        return [
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
 
 
 @register_metric(name='acc')
@@ -202,6 +210,9 @@ class Mean(Aggregator):
 
     name = 'mean'
 
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +241,7 @@ class Mean(Aggregator):
             if values:  # Only process non-empty value lists
                 aggregated_scores.append(
                     AggScore(
-                        score=
+                        score=self.agg_func(values),
                         metric_name=metric_name,
                         aggregation_name=self.name,
                         num=len(values),
@@ -241,6 +252,20 @@ class Mean(Aggregator):
         return aggregated_scores
 
 
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):
 
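A minimal standalone sketch of the clipped_mean aggregation added above: the mean is computed first and then clamped into [clip_min, clip_max]; the sample values are illustrative only.

def clipped_mean(values, clip_min=0.0, clip_max=1.0):
    # mean first, then clamp into the configured range
    m = sum(values) / len(values)
    return min(max(m, clip_min), clip_max)

print(clipped_mean([0.5, 2.0, 3.5]))  # raw mean 2.0 is clamped to 1.0
print(clipped_mean([0.2, 0.4]))       # raw mean 0.3 already lies inside [0, 1]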
evalscope/models/image_edit_model.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
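A hedged sketch of constructing the new ImageEditAPI directly; the model name and argument values below are illustrative assumptions, and in normal use the class is obtained through the image_editing model API registered in model_apis.py (next section).

from evalscope.models.image_edit_model import ImageEditAPI

# model_path, precision/torch_dtype, device_map and pipeline_cls are the
# model_args collected by __init__; anything else is forwarded to from_pretrained.
api = ImageEditAPI(
    model_name='Qwen/Qwen-Image-Edit',  # assumed name; 'qwen' in the name selects QwenImageEditPipeline
    precision='bfloat16',               # mapped to torch.bfloat16 via DTYPE_MAP
    device_map='cuda',                  # if omitted, get_device() chooses the device
)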
evalscope/models/model_apis.py
CHANGED
@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
evalscope/models/openai_compatible.py
CHANGED
@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # remove trailing slash from base_url
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
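A small standalone sketch of the normalization added above, using only standard string methods; the example URL is illustrative, and str.removesuffix requires Python 3.9+.

base_url = 'https://api.example.com/v1/chat/completions/'
normalized = base_url.rstrip('/').removesuffix('/chat/completions')
print(normalized)  # https://api.example.com/v1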
evalscope/models/text2image_model.py
CHANGED
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
             kwargs['num_inference_steps'] = config.num_inference_steps
         if config.guidance_scale is not None:
             kwargs['guidance_scale'] = config.guidance_scale
-
-
+        # update with extra model parameters
+        kwargs.update(config.model_extra)
 
         # assume the first text as prompt
         prompt = input[0].text
evalscope/models/utils/openai.py
CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]
 
         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )
 
     else:
@@ -209,7 +208,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +219,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +238,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/arguments.py
CHANGED
@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
evalscope/perf/benchmark.py
CHANGED
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
     try:
         for messages in message_generator.build_messages():
             dataset_messages.append(messages)
+            if len(dataset_messages) >= args.number:
+                break
     except StopIteration:
         pass
 
evalscope/perf/plugin/api/base.py
CHANGED
@@ -43,7 +43,7 @@ class ApiPluginBase:
 
     @abstractmethod
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
            body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         raise NotImplementedError
 
evalscope/perf/plugin/api/default_api.py
CHANGED
@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
         super().__init__(param)
 
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
            body: The request body
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         try:
             headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in process_request: {e}')
             yield (True, None, str(e))
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle streaming response from server-sent events.
 
         Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in _handle_stream: {e}')
             yield True, response.status, str(e)
 
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int,
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle the HTTP response based on content type and status.
 
         Args:
            response: The aiohttp response object
 
         Yields:
-            Tuple[bool, int,
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         response_status = response.status
         response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
            # Handle successful response with 'application/json' content type
            elif content_type_json in response_content_type:
                content = await response.json()
-               yield (False, response_status,
+               yield (False, response_status, content)
            # Handle other successful responses
            else:
                content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
        else:
            # error is always in JSON format
            error = await response.json()
-           yield (True, response_status,
+           yield (True, response_status, error)
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             return input_tokens, output_tokens
 
         # no usage information in the response, parse the response to get the tokens
-        delta_contents =
+        delta_contents = defaultdict(list)
         for response in responses:
             if 'object' in response:
                 self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens
 
-    def __process_response_object(self,
-        if
-
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif
-            for choice in
-
-
-
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self,
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request,
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-
-                output_tokens +=
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
             raise ValueError(
                 'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,59 @@ class OpenaiPlugin(DefaultApiPlugin):
                 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
             )
         return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request: Dict) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request (Dict): The request dictionary containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
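The image token estimate added in _count_input_tokens above reduces to a ceil-based patch count; a minimal standalone sketch with illustrative dimensions:

import math

def image_patch_tokens(height: int, width: int, patch_size: int = 28) -> int:
    # one token per patch, rounding partially covered rows and columns up
    return math.ceil(height / patch_size) * math.ceil(width / patch_size)

print(image_patch_tokens(224, 224))  # 8 * 8 = 64
print(image_patch_tokens(230, 224))  # 9 * 8 = 72; ceil counts the partial row of patches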
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['jpg']
             text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py
CHANGED
@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['image']
             text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py
CHANGED
@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
         # Generate random images based on image_num
         images_b64 = []
         for _ in range(self.image_num):
-            images_b64.append(
+            images_b64.append(self._generate_random_image_b64())
 
         message = self.create_message(text=prompt, image_urls=images_b64)
         yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
             draw.line(coords, fill=shape_color, width=random.randint(1, 5))
 
         # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
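The dataset plugins above now call PIL_to_base64 with add_header=True. A hedged sketch of what that implies for the return value; the exact data-URI prefix is an assumption, not shown in this diff:

from PIL import Image
from evalscope.utils.io_utils import PIL_to_base64

img = Image.new('RGB', (64, 64), color='white')
b64 = PIL_to_base64(img, format='PNG', add_header=True)
# expected to resemble 'data:image/png;base64,iVBORw0K...' rather than the bare payload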