evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/custom_api.py
@@ -1,5 +1,6 @@
+import aiohttp
 import json
-from typing import Any, Dict,
+from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -11,82 +12,238 @@ logger = get_logger()
 
 @register_api('custom')
 class CustomPlugin(ApiPluginBase):
-    """Support
+    """Support custom API implementations.
+
+    This class serves as a template for users to implement their own API plugins.
+    By extending this class, users can connect to any LLM API with custom request
+    and response formats.
     """
 
-    def __init__(self,
-        """
+    def __init__(self, param: Arguments):
+        """Initialize the plugin with the provided parameters.
 
         Args:
-
-
-
+            param (Arguments): Configuration parameters for the plugin, including:
+                - tokenizer_path: Path to the tokenizer for token counting
+                - model: Name of the model to use
+                - Other request parameters like max_tokens, temperature, etc.
         """
-        super().__init__(
-        if
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
             from modelscope import AutoTokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
-        """Build
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
+        """Build a custom API request body based on the input messages and parameters.
 
-
-
-            param (Arguments): The query parameters.
+        This method formats the input messages into the expected request format
+        for your custom API.
 
-
-
+        Args:
+            messages (Union[List[Dict], str]): The input messages to include in the request.
+                Could be a list of message dictionaries (for chat models) or a string (for completion models).
+            param (Arguments, optional): Request parameters. Defaults to self.param.
 
         Returns:
-            Dict:
+            Dict: A properly formatted request body for your custom API.
         """
+        param = param or self.param
         try:
-            query
-
-
+            # Create a default query format if no template is provided
+            if isinstance(messages, str):
+                query = {'input_text': messages}
+            else:
+                query = {'messages': messages}
+
+            # Add model parameters to the request
+            return self._add_parameters_to_request(query, param)
         except Exception as e:
             logger.exception(e)
-            logger.error('Prompt: %s invalidate!' % messages)
             return None
 
-    def
-        """
-
-
+    def _add_parameters_to_request(self, payload: Dict, param: Arguments) -> Dict:
+        """Add model parameters to the request payload.
+
+        This helper method adds various parameters like temperature, max_tokens, etc.
+        to the request based on what your custom API supports.
+
+        Args:
+            payload (Dict): The base request payload.
+            param (Arguments): The parameters to add.
+
+        Returns:
+            Dict: The request payload with added parameters.
+        """
+        # Add the model name
+        payload['model'] = param.model
+
+        # Add various parameters if they are provided
+        if param.max_tokens is not None:
+            payload['max_tokens'] = param.max_tokens
+        if param.temperature is not None:
+            payload['temperature'] = param.temperature
+        if param.top_p is not None:
+            payload['top_p'] = param.top_p
+        if param.top_k is not None:
+            payload['top_k'] = param.top_k
+        if param.stream is not None:
+            payload['stream'] = param.stream
+            payload['stream_options'] = {'include_usage': True}
 
+        # Add any extra arguments passed via command line
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
+
+        return payload
+
+    def parse_responses(self, responses: List[str], request: Any = None, **kwargs) -> Tuple[int, int]:
+        """Parse API responses and return token counts.
+
+        This method extracts the number of input and output tokens from the API responses.
+        Different APIs may return this information in different formats, or you may need
+        to calculate it using a tokenizer.
 
         Args:
-            responses (List[
-
-            kwargs:
+            responses (List[str]): List of API response strings.
+            request (Any, optional): The original request, which might be needed for token calculation.
+            **kwargs: Additional arguments.
+
         Returns:
-            Tuple:
+            Tuple[int, int]: (input_tokens, output_tokens) - The number of tokens in the prompt and completion.
+        """
+        try:
+            # Example 1: Try to get token counts from the API response
+            last_response = json.loads(responses[-1])
+
+            # If the API provides token usage information
+            if 'usage' in last_response and last_response['usage']:
+                input_tokens = last_response['usage'].get('prompt_tokens', 0)
+                output_tokens = last_response['usage'].get('completion_tokens', 0)
+                return input_tokens, output_tokens
+
+            # Example 2: Calculate tokens using the tokenizer if no usage info is available
+            if self.tokenizer is not None:
+                input_text = ''
+                output_text = ''
+
+                # Extract input text from the request
+                if request and 'messages' in request:
+                    # For chat API
+                    input_text = ' '.join([msg['content'] for msg in request['messages']])
+                elif request and 'input_text' in request:
+                    # For completion API
+                    input_text = request['input_text']
+
+                # Extract output text from the response
+                for response in responses:
+                    js = json.loads(response)
+                    if 'choices' in js:
+                        for choice in js['choices']:
+                            if 'message' in choice and 'content' in choice['message']:
+                                output_text += choice['message']['content']
+                            elif 'text' in choice:
+                                output_text += choice['text']
+
+                # Count tokens
+                input_tokens = len(self.tokenizer.encode(input_text))
+                output_tokens = len(self.tokenizer.encode(output_text))
+                return input_tokens, output_tokens
+
+            # If no usage information and no tokenizer, raise an error
+            raise ValueError(
+                'Cannot determine token counts: no usage information in response and no tokenizer provided.')
+
+        except Exception as e:
+            logger.error(f'Error parsing responses: {e}')
+            return 0, 0
+
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        This method handles sending the request to your API and processing the response,
+        including handling streaming responses if supported.
+
+        Args:
+            client_session (aiohttp.ClientSession): The aiohttp client session.
+            url (str): The API endpoint URL.
+            headers (Dict): The request headers.
+            body (Dict): The request body.
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+                - is_error: Whether the response indicates an error
+                - status_code: HTTP status code
+                - response_data: Response content
         """
-
-
-
-
-
-        data = json.
-
-        #
-
-
+        try:
+            # Set content type header
+            headers = {'Content-Type': 'application/json', **headers}
+
+            # Convert body to JSON
+            data = json.dumps(body, ensure_ascii=False)
+
+            # Send the request
+            async with client_session.request(
+                    'POST', url=url, data=data, headers=headers) as response:  # noqa: E125
+                # Get the status code
+                status_code = response.status
+
+                # Check if it's a streaming response
+                if 'text/event-stream' in response.content_type:
+                    # Handle streaming response
+                    async for line in response.content:
+                        line_str = line.decode('utf-8').strip()
+                        if not line_str:
+                            continue
+
+                        # Check for data prefix in server-sent events
+                        if line_str.startswith('data: '):
+                            data = line_str[6:]  # Remove 'data: ' prefix
+
+                            # Check if it's the end of the stream
+                            if data == '[DONE]':
+                                break
+
+                            try:
+                                # Parse the JSON data
+                                parsed_data = json.loads(data)
+                                yield (False, status_code, json.dumps(parsed_data))
+                            except json.JSONDecodeError:
+                                yield (True, status_code, f'Failed to parse JSON: {data}')
                 else:
-
-
-
-
-
-
-
-
-
-
-            logger.
-
-
+                    # Handle regular response
+                    if 'application/json' in response.content_type:
+                        # JSON response
+                        content = await response.json()
+                        yield (status_code >= 400, status_code, json.dumps(content))
+                    else:
+                        # Text response
+                        content = await response.text()
+                        yield (status_code >= 400, status_code, content)
+
+        except Exception as e:
+            logger.error(f'Error in process_request: {e}')
+            yield (True, 500, str(e))
+
+
+if __name__ == '__main__':
+    # Example usage of the CustomPlugin
+    from dotenv import dotenv_values
+    env = dotenv_values('.env')
+
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=env.get('DASHSCOPE_API_KEY'),
+        api='custom',  # Use the custom API plugin registered above
+        dataset='openqa',
+        number=1,
+        max_tokens=10)
+
+    run_perf_benchmark(args)
evalscope/perf/plugin/api/dashscope_api.py
@@ -13,17 +13,10 @@ logger = get_logger()
 @register_api('dashscope')
 class DashScopeApiPlugin(ApiPluginBase):
 
-    def __init__(self,
-
+    def __init__(self, param: Arguments):
+        super().__init__(param)
 
-
-            mode_path (str): The model path, we use the tokenizer
-                weight in the model to calculate the number of the
-                input and output tokens.
-        """
-        super().__init__(model_path=mode_path)
-
-    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
+    def build_request(self, messages: List[Dict], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -36,6 +29,7 @@ class DashScopeApiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
         try:
             if param.query_template is not None:
                 if param.query_template.startswith('@'):
evalscope/perf/plugin/api/default_api.py
@@ -0,0 +1,105 @@
+import aiohttp
+import json
+from http import HTTPStatus
+from typing import Any, AsyncGenerator, Dict, List, Tuple
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class DefaultApiPlugin(ApiPluginBase):
+    """Default implementation of API plugin with common HTTP handling methods."""
+
+    def __init__(self, param: Arguments):
+        super().__init__(param)
+
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        try:
+            headers = {'Content-Type': 'application/json', **headers}
+            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
+                async for result in self._handle_response(response):
+                    yield result
+        except Exception as e:
+            logger.error(f'Error in process_request: {e}')
+            yield (True, None, str(e))
+
+    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle streaming response from server-sent events.
+
+        Args:
+            response: The aiohttp response object containing a stream
+
+        Yields:
+            Tuple[bool, int, Any]: (is_error, status_code, data)
+        """
+        try:
+            async for chunk_bytes in response.content:
+                chunk_bytes = chunk_bytes.strip()
+                if not chunk_bytes:
+                    continue
+                chunk_bytes = chunk_bytes.decode('utf-8')
+                # NOTE: SSE comments (often used as pings) start with a colon.
+                # These are not JSON data payload and should be skipped.
+                if chunk_bytes.startswith(':'):
+                    continue
+
+                chunk = chunk_bytes.removeprefix('data: ')
+
+                if chunk != '[DONE]':
+                    data = json.loads(chunk)
+
+                    yield False, response.status, data
+
+        except Exception as e:
+            logger.error(f'Error in _handle_stream: {e}')
+            yield True, response.status, str(e)
+
+    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Handle the HTTP response based on content type and status.
+
+        Args:
+            response: The aiohttp response object
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        response_status = response.status
+        response_content_type = response.content_type
+        content_type_json = 'application/json'
+        content_type_stream = 'text/event-stream'
+        is_success = (response_status == HTTPStatus.OK)
+
+        if is_success:
+            # Handle successful response with 'text/event-stream' content type
+            if content_type_stream in response_content_type:
+                async for is_error, response_status, content in self._handle_stream(response):
+                    yield (is_error, response_status, content)
+            # Handle successful response with 'application/json' content type
+            elif content_type_json in response_content_type:
+                content = await response.json()
+                yield (False, response_status, json.dumps(content, ensure_ascii=False))
+            # Handle other successful responses
+            else:
+                content = await response.read()
+                yield (False, response_status, content.decode('utf-8'))
+        else:
+            # error is always in JSON format
+            error = await response.json()
+            yield (True, response_status, json.dumps(error, ensure_ascii=False))
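The refactor above concentrates the shared HTTP and server-sent-events handling in DefaultApiPlugin, so a concrete API plugin only has to build requests and parse responses. A minimal sketch of such a plugin (the 'my_api' name, payload keys, and response layout are illustrative assumptions, not part of evalscope):

import json
from typing import Any, Dict, List, Tuple, Union

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
from evalscope.perf.plugin.registry import register_api


@register_api('my_api')
class MyApiPlugin(DefaultApiPlugin):
    """Hypothetical plugin that inherits process_request/_handle_response from DefaultApiPlugin."""

    def __init__(self, param: Arguments):
        super().__init__(param)

    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
        param = param or self.param
        # Only the request body needs to be assembled; HTTP transport is inherited.
        return {'model': param.model, 'messages': messages, 'max_tokens': param.max_tokens}

    def parse_responses(self, responses: List[Any], request: Any = None, **kwargs) -> Tuple[int, int]:
        # Responses may arrive as raw JSON strings or parsed dicts depending on process_request.
        last = responses[-1]
        if isinstance(last, (str, bytes)):
            last = json.loads(last)
        usage = last.get('usage') or {}
        return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)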
evalscope/perf/plugin/api/openai_api.py
@@ -1,9 +1,9 @@
 import json
 import os
-from typing import Any, Dict,
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
-from evalscope.perf.plugin.api.
+from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
 from evalscope.utils.logger import get_logger
 
@@ -11,25 +11,25 @@ logger = get_logger()
 
 
 @register_api(['openai', 'local_vllm', 'local'])
-class OpenaiPlugin(
+class OpenaiPlugin(DefaultApiPlugin):
     """Base of openai interface."""
 
-    def __init__(self,
-        """
+    def __init__(self, param: Arguments):
+        """Initialize the OpenaiPlugin.
 
         Args:
-
-
-
+            param (Arguments): Configuration object containing parameters
+                such as the tokenizer path and model details. If a tokenizer
+                path is provided, it is used to initialize the tokenizer.
         """
-        super().__init__(
-        if
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
             from modelscope import AutoTokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -42,6 +42,7 @@ class OpenaiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
         try:
             if param.query_template is not None:
                 if param.query_template.startswith('@'):
@@ -54,8 +55,6 @@ class OpenaiPlugin(ApiPluginBase):
             else:
                 query = json.loads(param.query_template)
 
-                if 'stream' in query.keys():
-                    param.stream = query['stream']
                 # replace template messages with input messages.
                 query['messages'] = messages
         elif isinstance(messages, str):
@@ -107,7 +106,7 @@ class OpenaiPlugin(ApiPluginBase):
 
         # when stream, the last response is the full usage
        # when non-stream, the last response is the first response
-        last_response_js =
+        last_response_js = responses[-1]
         if 'usage' in last_response_js and last_response_js['usage']:
             input_tokens = last_response_js['usage']['prompt_tokens']
             output_tokens = last_response_js['usage']['completion_tokens']
@@ -116,11 +115,10 @@ class OpenaiPlugin(ApiPluginBase):
         # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
         for response in responses:
-
-
-                self.__process_response_object(js, delta_contents)
+            if 'object' in response:
+                self.__process_response_object(response, delta_contents)
             else:
-                self.__process_no_object(
+                self.__process_no_object(response, delta_contents)
 
         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens
evalscope/perf/plugin/datasets/__init__.py
@@ -1,7 +1,10 @@
-from
-from
-from
-from
-from
-from
-from
+from .base import DatasetPluginBase
+from .custom import CustomDatasetPlugin
+from .flickr8k import FlickrDatasetPlugin
+from .kontext_bench import KontextDatasetPlugin
+from .line_by_line import LineByLineDatasetPlugin
+from .longalpaca import LongAlpacaDatasetPlugin
+from .openqa import OpenqaDatasetPlugin
+from .random_dataset import RandomDatasetPlugin
+from .random_vl_dataset import RandomVLDatasetPlugin
+from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/base.py
@@ -1,7 +1,7 @@
 import json
 import sys
 from abc import abstractmethod
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 
@@ -64,3 +64,24 @@ class DatasetPluginBase:
             data = json.loads(content)
             for item in data:
                 yield item
+
+    def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+        """Create a message with text and optional image URLs.
+
+        Args:
+            text (str): The text content of the message.
+            image_urls (List[str], optional): List of image URLs. Defaults to None.
+            role (str, optional): The role of the message sender. Defaults to "user".
+
+        Returns:
+            Dict: A dictionary representing the message.
+        """
+        if image_urls is None:
+            message = {'role': role, 'content': text}
+        else:
+            message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+            if isinstance(image_urls, str):
+                image_urls = [image_urls]
+            for url in image_urls:
+                message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+        return message
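The new create_message helper produces OpenAI-style chat messages: a plain string content for text-only prompts, and the list-of-parts form when image URLs are supplied. An illustration of the returned dictionaries (the plugin instance and values are made up):

plugin.create_message('Describe this picture.')
# -> {'role': 'user', 'content': 'Describe this picture.'}

plugin.create_message('Describe this picture.', image_urls='data:image/jpeg;base64,...')
# -> {'role': 'user',
#     'content': [{'type': 'text', 'text': 'Describe this picture.'},
#                 {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,...'}}]}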
evalscope/perf/plugin/datasets/custom.py
@@ -19,7 +19,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
                 if self.query_parameters.apply_chat_template:
-
+                    message = self.create_message(prompt)
+                    yield [message]
                 else:
                     yield prompt
 
evalscope/perf/plugin/datasets/flickr8k.py
@@ -1,18 +1,9 @@
-import base64
-from io import BytesIO
-from PIL import Image
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
-
-
-def PIL_to_base64(image: Image.Image) -> str:
-    buffered = BytesIO()
-    image.save(buffered, format='JPEG')
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-    return img_str
+from evalscope.utils.io_utils import PIL_to_base64
 
 
 @register_dataset('flickr8k')
@@ -31,21 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         for item in dataset:
             pil_image = item['jpg']
             text = item['txt']
-
+            base64_image = PIL_to_base64(pil_image)
 
-
-
-                'user',
-                'content': [
-                    {
-                        'type': 'text',
-                        'text': text,
-                    },
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': f'data:image/jpeg;base64,{base64_iamge}',
-                        }
-                    },
-                ],
-            }]
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py
@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('kontext_bench')
+class KontextDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
+        dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+        for item in dataset:
+            pil_image = item['image']
+            text = item['instruction']
+            base64_image = PIL_to_base64(pil_image)
+
+            message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+            yield [message]
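Since KontextDatasetPlugin is registered under 'kontext_bench', it can be selected the same way as the 'openqa' dataset in the CustomPlugin example above. A hedged sketch, assuming an OpenAI-compatible multimodal endpoint (the URL, model name, and token limit are placeholders):

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

args = Arguments(
    model='qwen-vl-plus',  # placeholder model name
    url='http://localhost:8000/v1/chat/completions',  # placeholder endpoint
    api='openai',  # reuse the OpenaiPlugin shown above
    dataset='kontext_bench',  # name registered by @register_dataset above
    number=1,
    max_tokens=64)

run_perf_benchmark(args)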
|