evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/metrics/llm_judge.py

@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )
 
-    def judge(self, prompt: str, system_prompt: Optional[str] = None) -> str:
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
            prompt (str): The prompt to evaluate
            system_prompt (str, optional): The system prompt to use for the evaluation
+           messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
        Returns:
            str: The response from the LLM
        """
-        from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
-
-        system_content = system_prompt or self.system_prompt
-        input_messages = [ChatMessageUser(content=prompt)]
-        if system_content:
-            input_messages.insert(0, ChatMessageSystem(content=system_content))
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
             response = self.model.generate(input_messages)
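
Note: the new `messages` parameter lets a caller hand `LLMJudge.judge` a pre-built conversation instead of a bare prompt; when it is set, `prompt` and `system_prompt` are ignored. A minimal usage sketch (the `judge` instance and the prompt text are illustrative, not taken from this diff):

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

    # judge: an already-initialized LLMJudge instance
    # old style, still supported:
    verdict = judge.judge(prompt='Is the answer correct? Reply YES or NO.',
                          system_prompt='You are a strict grader.')

    # new in 1.0.2: pass the conversation directly
    verdict = judge.judge(messages=[
        ChatMessageSystem(content='You are a strict grader.'),
        ChatMessageUser(content='Is the answer correct? Reply YES or NO.'),
    ])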
evalscope/metrics/metric.py

@@ -6,11 +6,19 @@ from evalscope.api.registry import register_aggregation, register_metric
 from .metrics import mean
 
 
+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 @register_metric(name='exact_match')
 class ExactMatch(Metric):
 
     def apply(self, predictions, references):
-        return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
 
 
 @register_metric(name='acc')
@@ -202,6 +210,9 @@ class Mean(Aggregator):
 
     name = 'mean'
 
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +241,7 @@
             if values:  # Only process non-empty value lists
                 aggregated_scores.append(
                     AggScore(
-                        score=mean(values),
+                        score=self.agg_func(values),
                         metric_name=metric_name,
                         aggregation_name=self.name,
                         num=len(values),
@@ -241,6 +252,20 @@
         return aggregated_scores
 
 
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):
 
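Note: taken together, these two changes make `exact_match` tolerant of case and surrounding whitespace, and add a clamped variant of `mean`. A standalone sketch of the resulting behavior (re-implemented here for illustration, not the registered classes):

    def normalize_text(text: str) -> str:
        return text.strip().lower()

    # exact_match, 1.0.2 behavior: case/whitespace differences no longer count as misses
    print(float(normalize_text(' YES\n') == normalize_text('yes')))  # 1.0 (was 0.0 in 1.0.0)

    # clipped_mean: the aggregated mean is clamped into [clip_min, clip_max]
    values = [1.5, 0.5, 2.0]
    raw_mean = sum(values) / len(values)    # ~1.333
    clipped = min(max(raw_mean, 0.0), 1.0)  # 1.0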
evalscope/models/image_edit_model.py (new file)

@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import importlib
+import time
+import torch
+from logging import getLogger
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+from evalscope.api.messages import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
+from evalscope.api.model import (
+    ChatCompletionChoice,
+    GenerateConfig,
+    Logprob,
+    Logprobs,
+    ModelAPI,
+    ModelOutput,
+    ModelUsage,
+    TopLogprob,
+)
+from evalscope.api.tool import ToolChoice, ToolInfo
+from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+from evalscope.utils.model_utils import get_device
+
+logger = getLogger()
+
+
+class ImageEditAPI(ModelAPI):
+
+    def __init__(
+        self,
+        model_name: str,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        config: GenerateConfig = GenerateConfig(),
+        **model_args: Any,
+    ):
+        super().__init__(
+            model_name=model_name,
+            base_url=base_url,
+            api_key=api_key,
+            config=config,
+        )
+
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Optional[Any]:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        model_path = collect_model_arg('model_path')
+        torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+        device_map = collect_model_arg('device_map')
+        # torch dtype
+        DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+        self.torch_dtype = torch_dtype
+        self.device = device_map or get_device()
+
+        self.pipeline_cls = collect_model_arg('pipeline_cls')
+        # default to QwenImageEditPipeline for Qwen models if not specified
+        if self.pipeline_cls is None:
+            if 'qwen' in model_name.lower():
+                self.pipeline_cls = 'QwenImageEditPipeline'
+            else:
+                logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                raise ValueError('Invalid pipeline class.')
+
+        model_name_or_path = model_path or model_name
+
+        # equivalent to: from modelscope import <pipeline_cls>
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+        logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            model_name_or_path,
+            torch_dtype=self.torch_dtype,
+            **model_args,
+        )
+
+        self.model.to(self.device)
+
+    def generate(
+        self,
+        input: List[ChatMessage],
+        tools: List[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+
+        # prepare generator
+        kwargs: Dict[str, Any] = {}
+        if config.num_inference_steps is not None:
+            kwargs['num_inference_steps'] = config.num_inference_steps
+        kwargs.update(config.model_extra)
+
+        # assume the first text as prompt
+        content = input[0].content
+        assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+            'Invalid content types, expected (ContentText, ContentImage)'
+
+        prompt = content[0].text
+        input_image_base64 = content[1].image
+        input_image = base64_to_PIL(input_image_base64)
+        # get the first image as output
+        output = self.model(image=input_image, prompt=prompt, **kwargs)
+        image = output.images[0]
+
+        image_base64 = PIL_to_base64(image)
+
+        return ModelOutput(
+            model=self.model_name,
+            choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+            time=time.time(),
+        )
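
Note: a minimal sketch of driving the new `ImageEditAPI` directly, assuming a local Qwen image-edit checkpoint. The model id, the image path, and the `tool_choice='none'` literal are illustrative assumptions, not values taken from this diff; in normal use the adapter is resolved through the `image_editing` model API registered in evalscope/models/model_apis.py (next diff). The attribute access at the end mirrors how `generate` builds its ModelOutput above:

    from PIL import Image

    from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText
    from evalscope.api.model import GenerateConfig
    from evalscope.models.image_edit_model import ImageEditAPI
    from evalscope.utils.io_utils import PIL_to_base64

    api = ImageEditAPI(model_name='Qwen/Qwen-Image-Edit')  # 'qwen' in the name selects QwenImageEditPipeline
    msg = ChatMessageUser(content=[
        ContentText(text='Make the cat wear sunglasses'),          # text must come first,
        ContentImage(image=PIL_to_base64(Image.open('cat.png'))),  # then the image
    ])
    out = api.generate([msg], tools=[], tool_choice='none',
                       config=GenerateConfig(num_inference_steps=20))
    edited_b64 = out.choices[0].message.content[0].image  # base64 of the edited image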
evalscope/models/model_apis.py

@@ -1,6 +1,7 @@
 from evalscope.api.model import ModelAPI
 from evalscope.api.registry import register_model_api
 from evalscope.utils.deprecation_utils import deprecated
+from evalscope.utils.import_utils import check_import
 
 
 @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:
 
 @register_model_api(name='llm_ckpt')
 def llm_ckpt() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
 @register_model_api(name='checkpoint')
 @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
 def checkpoint() -> type[ModelAPI]:
+    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
     from .modelscope import ModelScopeAPI
 
     return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:
 
 @register_model_api(name='text2image')
 def text2image() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='text2image')
+
     from .text2image_model import Text2ImageAPI
 
     return Text2ImageAPI
+
+
+@register_model_api(name='image_editing')
+def image_editing() -> type[ModelAPI]:
+    check_import(['torch', 'torchvision', 'diffusers'],
+                 package='evalscope[aigc]',
+                 raise_error=True,
+                 feature_name='image_editing')
+
+    from .image_edit_model import ImageEditAPI
+
+    return ImageEditAPI
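
Note: the `check_import` guard makes the optional-dependency failure mode explicit: the check runs before the heavy module import, so a missing extra surfaces as an actionable error rather than a raw ImportError from deep inside the import chain. The two call shapes below are copied verbatim from the diff above; the error wording is an assumption, since it is not shown here:

    from evalscope.utils.import_utils import check_import

    # single dependency, installable under its own name
    check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')

    # several dependencies covered by one extra; with raise_error=True a missing
    # package raises and points the user at `pip install evalscope[aigc]`
    check_import(['torch', 'torchvision', 'diffusers'],
                 package='evalscope[aigc]',
                 raise_error=True,
                 feature_name='text2image')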
evalscope/models/openai_compatible.py

@@ -48,6 +48,9 @@ class OpenAICompatibleAPI(ModelAPI):
         self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
         assert self.base_url, f'Base URL for {model_name} not found'
 
+        # normalize base_url: strip any trailing slash and a pasted-in '/chat/completions' suffix
+        self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
         # create http client
         self.client = OpenAI(
             api_key=self.api_key,
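
Note: this normalization accepts a base URL with or without a trailing slash or a full `/chat/completions` endpoint and reduces both to the bare API root (`str.removesuffix` requires Python 3.9+). Plain-Python illustration:

    for url in ('http://localhost:8000/v1/',
                'http://localhost:8000/v1/chat/completions'):
        print(url.rstrip('/').removesuffix('/chat/completions'))
    # both print: http://localhost:8000/v1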
evalscope/models/text2image_model.py

@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
            kwargs['num_inference_steps'] = config.num_inference_steps
        if config.guidance_scale is not None:
            kwargs['guidance_scale'] = config.guidance_scale
-       if config.extra_body is not None:
-           kwargs.update(config.extra_body)
+       # update with extra model parameters
+       kwargs.update(config.model_extra)
 
        # assume the first text as prompt
        prompt = input[0].text
evalscope/models/utils/openai.py

@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
         )
     elif content.type == 'audio':
         audio_data_uri = file_as_data_uri(content.audio)
-        audio_data = audio_data_uri.split('base64,')[1]
 
         return ChatCompletionContentPartInputAudioParam(
-            type='input_audio', input_audio=dict(data=audio_data, format=content.format)
+            type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
         )
 
     else:
@@ -209,7 +208,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
     return params
 
 
-def openai_assistant_content(message: ChatMessageAssistant) -> str:
+def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
     # In agent bridge scenarios, we could encounter concepts such as reasoning and
     # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
     # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +219,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     else:
         content = ''
         for c in message.content:
-            if c.type == 'reasoning':
+            if c.type == 'reasoning' and include_reasoning:
                 attribs = ''
                 if c.signature is not None:
                     attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +238,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
     return content
 
 
-def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
     oai_choices: List[Choice] = []
 
     for index, choice in enumerate(choices):
-        content = openai_assistant_content(choice.message)
+        # Handle content
+        content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+        # Handle tool calls
         if choice.message.tool_calls:
             tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
         else:
evalscope/perf/arguments.py

@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
     image_height: int = 224  # Height of the image for random VL dataset
     image_format: str = 'RGB'  # Image format for random VL dataset
     image_num: int = 1  # Number of images for random VL dataset
+    image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+    parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
evalscope/perf/benchmark.py

@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
    try:
        for messages in message_generator.build_messages():
            dataset_messages.append(messages)
+           if len(dataset_messages) >= args.number:
+               break
    except StopIteration:
        pass
 
evalscope/perf/plugin/api/base.py

@@ -43,7 +43,7 @@ class ApiPluginBase:
 
     @abstractmethod
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
             body: The request body
 
         Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         raise NotImplementedError
 
evalscope/perf/plugin/api/default_api.py

@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
         super().__init__(param)
 
     async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
             body: The request body
 
         Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         try:
             headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in process_request: {e}')
             yield (True, None, str(e))
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle streaming response from server-sent events.
 
         Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
             logger.error(f'Error in _handle_stream: {e}')
             yield True, response.status, str(e)
 
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
         """Handle the HTTP response based on content type and status.
 
         Args:
             response: The aiohttp response object
 
         Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+            Tuple[bool, int, Any]: (is_error, status_code, response_data)
         """
         response_status = response.status
         response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
             # Handle successful response with 'application/json' content type
             elif content_type_json in response_content_type:
                 content = await response.json()
-                yield (False, response_status, json.dumps(content, ensure_ascii=False))
+                yield (False, response_status, content)
             # Handle other successful responses
             else:
                 content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
         else:
             # error is always in JSON format
             error = await response.json()
-            yield (True, response_status, json.dumps(error, ensure_ascii=False))
+            yield (True, response_status, error)
evalscope/perf/plugin/api/openai_api.py

@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
            return input_tokens, output_tokens
 
        # no usage information in the response, parse the response to get the tokens
-       delta_contents = {}
+       delta_contents = defaultdict(list)
        for response in responses:
            if 'object' in response:
                self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
 
-    def __process_response_object(self, js, delta_contents):
-        if js['object'] == 'chat.completion':
-            for choice in js['choices']:
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif js['object'] == 'text_completion':
-            for choice in js['choices']:
-                delta_contents[choice['index']] = [choice['text']]
-        elif js['object'] == 'chat.completion.chunk':
-            for choice in js.get('choices', []):
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-                        delta_content = delta['content']
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self, js, delta_contents):
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-        for choice in js['choices']:
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-                    delta_content = delta['content']
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request, delta_contents):
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-            for idx, choice_contents in delta_contents.items():
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-                input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
-                output_tokens += len(self.tokenizer.encode(full_response_content))
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
             raise ValueError(
                 'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,59 @@
                'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
            )
        return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request: Dict) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request (Dict): The request dictionary containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
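
Note: the image contribution to `_count_input_tokens` is plain patch arithmetic: each image costs roughly ceil(H/P) * ceil(W/P) tokens for patch size P (the new `--image-patch-size`, default 28). A worked example:

    import math

    height, width, patch = 532, 1024, 28
    n_patches = math.ceil(height / patch) * math.ceil(width / patch)
    # ceil(532/28) = 19, ceil(1024/28) = 37  ->  19 * 37 = 703 image tokens
    print(n_patches)  # 703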
evalscope/perf/plugin/datasets/flickr8k.py

@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['jpg']
             text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py

@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['image']
             text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+            message = self.create_message(text=text, image_urls=base64_image)
             yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py

@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
         # Generate random images based on image_num
         images_b64 = []
         for _ in range(self.image_num):
-            images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+            images_b64.append(self._generate_random_image_b64())
 
         message = self.create_message(text=prompt, image_urls=images_b64)
         yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
             draw.line(coords, fill=shape_color, width=random.randint(1, 5))
 
         # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
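
Note: all three dataset plugins now rely on `PIL_to_base64(..., add_header=True)` instead of concatenating the data-URI prefix by hand, which centralizes the prefix (and its media type) in one helper. Behavior as inferred from this diff; the actual signature lives in evalscope/utils/io_utils.py:

    from PIL import Image

    from evalscope.utils.io_utils import PIL_to_base64

    img = Image.new('RGB', (64, 64), 'red')
    raw = PIL_to_base64(img, format='PNG')                   # bare base64 payload
    uri = PIL_to_base64(img, format='PNG', add_header=True)  # 'data:image/png;base64,...'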