evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/chat_adapter.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import time
 import torch
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         return generation_config
 
     def _model_generate(self,
-
-                        system_prompts: List[str] = None,
+                        formatted_prompts: List[str],
                         infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
-
-            system_prompts: The system prompts.
+            formatted_prompts: The formatted prompts.
             infer_cfg: The inference configuration.
         Returns:
             The prediction results.
         """
-        if system_prompts is None:
-            system_prompts = []
         if infer_cfg is None:
             infer_cfg = {}
 
@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         self.generation_config.update(**infer_cfg)
         fix_do_sample_warning(self.generation_config)
 
-        # For chat model, use the chat template to format the input
-        if self.tokenizer.chat_template is not None:
-            formatted_prompts = []
-            for i, query in enumerate(queries):
-                messages = [ChatMessage(role='user', content=query)]
-                if i < len(system_prompts) and system_prompts[i]:
-                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
-                # whether thinking is needed
-                chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
-                if chat_template_kwargs is not None:
-                    prompts = self.tokenizer.apply_chat_template(
-                        messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
-                else:
-                    prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-                formatted_prompts.append(prompts)
-        else:
-            # For base model, use the queries as the input
-            formatted_prompts = queries
-
-        logger.debug(f'formatted_prompts: {formatted_prompts}')
-
         # Get input ids
         inputs = self.tokenizer(
             formatted_prompts, return_tensors='pt', padding=True, truncation=True,
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return responses, input_lengths
 
-
-    def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
+    def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
         """
+        Prepare the inputs for the model.
         Args:
             inputs: The input data.
             infer_cfg: The inference configuration.
         Returns:
-            The
+            The prepared inputs and system prompts.
         """
-
-        # Process inputs
         queries = []
         system_prompts = []
+        message_list = []
 
         for input_item in inputs:
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
+            if input_item.get('messages', None):
+                message_list.append(input_item.get('messages', None))
+
+        # For non chat model, use the original queries as the input
+        if self.tokenizer.chat_template is None:
+            return queries
+
+        # For chat model, use the messages as the input
+        # if message_list is None, use the queries as the input
+        if len(message_list) == 0:
+            for i, query in enumerate(queries):
+                messages = [ChatMessage(role='user', content=query)]
+                if i < len(system_prompts) and system_prompts[i]:
+                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                message_list.append(messages)
+
+        # Format the messages
+        formatted_prompts = []
+        for messages in message_list:
+            # apply chat template
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
+                prompts = self.tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+            else:
+                prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            formatted_prompts.append(prompts)
+
+        logger.debug(f'formatted_prompts: {formatted_prompts}')
+        return formatted_prompts
+
+    @torch.no_grad()
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
+        """
+        Args:
+            inputs: The input data.
+            infer_cfg: The inference configuration.
+        Returns:
+            The prediction results.
+        """
+
+        # Process inputs
+        formatted_prompts = self._prepare_inputs(inputs, infer_cfg)
 
         # Run inference
-        responses, input_lengths = self._model_generate(
+        responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)
 
         # Process outputs
         results = []
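The reworked `ChatGenerationModelAdapter.predict` delegates prompt preparation to `_prepare_inputs`, which prefers a ready-made `messages` list on each input item and only falls back to rebuilding one from `data` and `system_prompt`. Below is a minimal usage sketch of the input shape this implies; the prompt texts and `infer_cfg` values are illustrative, and constructing the adapter itself is omitted.

```python
# Illustrative input batch for ChatGenerationModelAdapter.predict().
inputs = [
    {   # built from 'data' + 'system_prompt', as before
        'data': ['What is the capital of France?'],
        'system_prompt': 'You are a concise assistant.',
    },
    {   # a ready-made 'messages' list is now used directly by _prepare_inputs
        'data': ['Explain beam search in one sentence.'],
        'messages': [
            {'role': 'system', 'content': 'You are a concise assistant.'},
            {'role': 'user', 'content': 'Explain beam search in one sentence.'},
        ],
    },
]

# results = adapter.predict(inputs, infer_cfg={'max_new_tokens': 128})
```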
evalscope/models/adapters/server_adapter.py CHANGED
@@ -1,11 +1,11 @@
 import openai
 from collections import defaultdict
-from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
@@ -31,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
-        self.supported_params = self.
+        self.supported_params = get_supported_params(self.client.chat.completions.create)
 
         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -39,11 +39,7 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
 
-    def
-        sig = signature(self.client.chat.completions.create)
-        return list(sig.parameters.keys())
-
-    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.
 
@@ -65,23 +61,26 @@ class ServerModelAdapter(BaseModelAdapter):
 
     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-
-
-            query = '\n'.join(''.join(item) for item in data)
-            system_prompt = input_item.get('system_prompt', None)
+        if input_item.get('messages', None):
+            content = input_item['messages']
         else:
-
-            system_prompt = input_item.get('system_prompt', None)
-
-        content = self.make_request_content(query, system_prompt)
+            content = self.make_request_content(input_item)
         request_json = self.make_request(content, infer_cfg)
         response = self.send_request(request_json)
         return response
 
-    def make_request_content(self,
+    def make_request_content(self, input_item: dict) -> list:
         """
         Make request content for OpenAI API.
         """
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)
+
         messages = []
         if system_prompt:
             messages.append({'role': 'system', 'content': system_prompt})
@@ -90,16 +89,9 @@ class ServerModelAdapter(BaseModelAdapter):
 
         return messages
 
-    def make_request(self, content: list, infer_cfg: dict
+    def make_request(self, content: list, infer_cfg: dict) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-        from evalscope.config import DEFAULT_GENERATION_CONFIG
-        if infer_cfg == DEFAULT_GENERATION_CONFIG:
-            infer_cfg = {
-                'max_tokens': 2048,
-                'temperature': 0.0,
-            }
-
         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
 
         if self.timeout:
@@ -137,7 +129,7 @@ class ServerModelAdapter(BaseModelAdapter):
             return response.model_dump(exclude_unset=True)
         except Exception as e:
             logger.error(f'Error when calling remote API: {str(e)}')
-            raise
+            raise e
 
     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
evalscope/perf/arguments.py CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
 
@@ -27,8 +27,8 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number: int = 1000  # Number of requests to be made
-    parallel: int = 1  # Number of parallel requests
+    number: Union[int, List[int]] = 1000  # Number of requests to be made
+    parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
 
     # Logging and debugging
@@ -60,8 +60,8 @@ class Arguments:
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
    seed: Optional[int] = 0  # Random seed for reproducibility
-    stop: Optional[List[str]] =
-    stop_token_ids: Optional[List[str]] =
+    stop: Optional[List[str]] = None  # Stop sequences for the response
+    stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
     temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
@@ -98,6 +98,15 @@ class Arguments:
         if self.apply_chat_template is None:
             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
 
+        # Set number and parallel to lists if they are integers
+        if isinstance(self.number, int):
+            self.number = [self.number]
+        if isinstance(self.parallel, int):
+            self.parallel = [self.parallel]
+        assert len(self.number) == len(
+            self.parallel
+        ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -143,8 +152,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
-    parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+    parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+    parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
     # Logging and debugging
evalscope/perf/benchmark.py CHANGED
@@ -1,11 +1,8 @@
 import asyncio
-import copy
 import json
 import numpy as np
-import os
 import platform
 import sqlite3
-import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
@@ -17,8 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
-from evalscope.perf.utils.local_server import start_app
-from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -116,11 +111,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
     result_db_path = get_result_db_path(args)
 
-    if args.wandb_api_key:
-        init_wandb(args)
-    if args.swanlab_api_key:
-        init_swanlab(args)
-
     collected_benchmark_data = []
 
     with tqdm(desc='Processing', total=args.number) as pbar:
@@ -170,11 +160,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
 @exception_handler
 async def connect_test(args: Arguments) -> bool:
-    if args.api.startswith('local'):
-        # start local server
-        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-        server.start()
-
     if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')
 
evalscope/perf/main.py CHANGED
@@ -1,32 +1,34 @@
 import asyncio
+import copy
 import os
 import platform
+import threading
+import time
 from argparse import Namespace
 
-from evalscope.perf.
-from evalscope.perf.
-from evalscope.perf.utils.db_util import get_output_path
-from evalscope.perf.utils.handler import add_signal_handlers
+from evalscope.perf.utils.local_server import start_app
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
+from .arguments import Arguments, parse_args
+from .benchmark import benchmark
+from .utils.db_util import get_output_path
+from .utils.handler import add_signal_handlers
+from .utils.rich_display import print_summary
 
 logger = get_logger()
 
 
-def
-    if isinstance(args,
-        args =
-
-    args =
-
-    if args.seed is not None:
-        seed_everything(args.seed)
+def run_one_benchmark(args: Arguments, output_path: str = None):
+    if isinstance(args.parallel, list):
+        args.parallel = args.parallel[0]
+    if isinstance(args.number, list):
+        args.number = args.number[0]
 
     # Setup logger and output
-    args.outputs_dir =
-    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+    args.outputs_dir = output_path
 
-    logger.info('Starting benchmark
+    logger.info('Starting benchmark with args: ')
     logger.info(args)
 
     if platform.system() == 'Windows':
@@ -39,6 +41,61 @@ def run_perf_benchmark(args):
     return loop.run_until_complete(benchmark(args))
 
 
+def run_multi_benchmark(args: Arguments, output_path: str = None):
+    results = []
+    number_list = copy.deepcopy(args.number)
+    parallel_list = copy.deepcopy(args.parallel)
+    for i, (number, parallel) in enumerate(zip(number_list, parallel_list)):
+        args.number = number
+        args.parallel = parallel
+        # Set up output path for each run
+        cur_output_path = os.path.join(output_path, f'parallel_{parallel}_number_{number}')
+        os.makedirs(cur_output_path, exist_ok=True)
+        # Start the benchmark
+        metrics_result = run_one_benchmark(args, output_path=cur_output_path)
+        # Save the results
+        results.append(metrics_result)
+        # Sleep between runs to avoid overwhelming the server
+        if i < len(number_list) - 1:
+            logger.info('Sleeping for 5 seconds before the next run...')
+            time.sleep(5)
+    # Analyze results
+    print_summary(results, args.model_id)
+    return results
+
+
+def run_perf_benchmark(args):
+    # Check if args is a dictionary or Namespace
+    if isinstance(args, dict):
+        args = Arguments(**args)
+    elif isinstance(args, Namespace):
+        args = Arguments.from_args(args)
+
+    if args.seed is not None:
+        seed_everything(args.seed)
+
+    # Initialize output directory
+    output_path = get_output_path(args)
+    configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
+
+    # Initialize wandb and swanlab
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)
+
+    # Initialize local server if needed
+    if args.api.startswith('local'):
+        # start local server
+        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+        server.start()
+    # Start benchmark
+    if len(args.number) == 1:
+        return run_one_benchmark(args, output_path=output_path)
+    else:
+        return run_multi_benchmark(args, output_path=output_path)
+
+
 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
     metrics_result, percentile_result = run_perf_benchmark(args)
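`run_perf_benchmark` now normalises its input, starts the optional local server and experiment trackers once, and then dispatches to `run_one_benchmark` or `run_multi_benchmark`; each `(number, parallel)` pair gets its own `parallel_{parallel}_number_{number}` subdirectory and the combined results are printed via `print_summary`. A sketch of driving such a sweep from Python; the endpoint, model and dataset are illustrative placeholders.

```python
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Illustrative endpoint/model/dataset; each (number, parallel) pair becomes one
# run under the output directory, e.g. .../parallel_2_number_200/, with a 5 s
# pause between runs before print_summary() aggregates the results.
args = Arguments(
    model='my-served-model',
    url='http://127.0.0.1:8000/v1/chat/completions',
    api_key='EMPTY',
    dataset='openqa',  # assumed dataset plugin name; swap in your own
    number=[100, 200],
    parallel=[1, 2],
)

results = run_perf_benchmark(args)  # a list of per-run metric results for a sweep
```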
evalscope/perf/plugin/datasets/custom.py CHANGED
@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
                 yield [{'role': 'user', 'content': prompt}]
             else:
                 yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-            self.time_per_output_token = self.n_chunks_time / self.
+            self.time_per_output_token = self.n_chunks_time / self.n_chunks
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -51,6 +51,24 @@ class BenchmarkData:
             self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
 
 
+class Metrics:
+    TIME_TAKEN_FOR_TESTS = 'Time taken for tests (s)'
+    NUMBER_OF_CONCURRENCY = 'Number of concurrency'
+    TOTAL_REQUESTS = 'Total requests'
+    SUCCEED_REQUESTS = 'Succeed requests'
+    FAILED_REQUESTS = 'Failed requests'
+    OUTPUT_TOKEN_THROUGHPUT = 'Output token throughput (tok/s)'
+    TOTAL_TOKEN_THROUGHPUT = 'Total token throughput (tok/s)'
+    REQUEST_THROUGHPUT = 'Request throughput (req/s)'
+    AVERAGE_LATENCY = 'Average latency (s)'
+    AVERAGE_TIME_TO_FIRST_TOKEN = 'Average time to first token (s)'
+    AVERAGE_TIME_PER_OUTPUT_TOKEN = 'Average time per output token (s)'
+    AVERAGE_INPUT_TOKENS_PER_REQUEST = 'Average input tokens per request'
+    AVERAGE_OUTPUT_TOKENS_PER_REQUEST = 'Average output tokens per request'
+    AVERAGE_PACKAGE_LATENCY = 'Average package latency (s)'
+    AVERAGE_PACKAGE_PER_REQUEST = 'Average package per request'
+
+
 @dataclass
 class BenchmarkMetrics:
     concurrency: int = 0
@@ -125,20 +143,20 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=4):
         message = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            Metrics.TIME_TAKEN_FOR_TESTS: round(self.total_time, default_ndigits),
+            Metrics.NUMBER_OF_CONCURRENCY: self.concurrency,
+            Metrics.TOTAL_REQUESTS: int(self.n_total_queries),
+            Metrics.SUCCEED_REQUESTS: self.n_succeed_queries,
+            Metrics.FAILED_REQUESTS: self.n_failed_queries,
+            Metrics.OUTPUT_TOKEN_THROUGHPUT: round(self.avg_output_token_per_seconds, default_ndigits),
+            Metrics.TOTAL_TOKEN_THROUGHPUT: round(self.avg_total_token_per_seconds, default_ndigits),
+            Metrics.REQUEST_THROUGHPUT: round(self.qps, default_ndigits),
+            Metrics.AVERAGE_LATENCY: round(self.avg_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: round(self.avg_first_chunk_latency, default_ndigits),
+            Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: round(self.avg_time_per_token, default_ndigits),
+            Metrics.AVERAGE_INPUT_TOKENS_PER_REQUEST: round(self.avg_prompt_tokens, default_ndigits),
+            Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: round(self.avg_completion_tokens, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_LATENCY: round(self.avg_chunk_time, default_ndigits),
+            Metrics.AVERAGE_PACKAGE_PER_REQUEST: round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py CHANGED
@@ -111,6 +111,18 @@ def get_result_db_path(args: Arguments):
     return result_db_path
 
 
+class PercentileMetrics:
+    TTFT = 'TTFT (s)'
+    ITL = 'ITL (s)'
+    TPOT = 'TPOT (s)'
+    LATENCY = 'Latency (s)'
+    INPUT_TOKENS = 'Input tokens'
+    OUTPUT_TOKENS = 'Output tokens'
+    OUTPUT_THROUGHPUT = 'Output (tok/s)'
+    TOTAL_THROUGHPUT = 'Total (tok/s)'
+    PERCENTILES = 'Percentiles'
+
+
 def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
     """
     Calculate the percentiles for a specific list of data.
@@ -157,10 +169,6 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()
 
-    if len(rows) < len(percentiles):
-        logger.info('Too little data to calculate quantiles!')
-        return {}
-
     # Define index variables for columns
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
@@ -175,24 +183,25 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
 
     metrics = {
-
-
+        PercentileMetrics.TTFT: [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        PercentileMetrics.ITL:
         inter_token_latencies_all,
-
+        PercentileMetrics.TPOT:
         [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
          for row in rows],
-
-
-
-
+        PercentileMetrics.LATENCY: [row[LATENCY_INDEX] for row in rows],
+        PercentileMetrics.INPUT_TOKENS: [row[PROMPT_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_TOKENS: [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        PercentileMetrics.OUTPUT_THROUGHPUT:
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
          for row in rows],
-
-
+        PercentileMetrics.TOTAL_THROUGHPUT: [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                              / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+                                             for row in rows]
     }
 
     # Calculate percentiles for each metric
-    results = {
+    results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
     for metric_name, data in metrics.items():
         metric_percentiles = calculate_percentiles(data, percentiles)
         results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -205,7 +214,6 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
     metrics_result = metrics.create_message()
-    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
@@ -223,6 +231,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    logger.info(f'Save the summary to: {result_path}')
+
     return metrics_result, percentile_result
 
 
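`calculate_percentiles` reduces each metric list in `get_percentile_results` to the requested percentiles. Below is a minimal sketch of an equivalent computation with NumPy, assuming the signature shown in the hunk above; the shipped implementation may differ, for example in how it treats the `float('nan')` placeholders.

```python
from typing import Dict, List

import numpy as np


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    # Map each requested percentile (e.g. 50, 90, 99) to its value, skipping the
    # float('nan') placeholders inserted for degenerate rows above.
    values = np.asarray(data, dtype=float)
    return {p: float(np.nanpercentile(values, p)) for p in percentiles}


# Example: calculate_percentiles([0.8, 1.2, 0.9, float('nan')], [50, 90])
```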
evalscope/perf/utils/local_server.py CHANGED
@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:
 
 
 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/perf/utils/log_utils.py CHANGED
@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
-
-        project
-        name
-        config
-        mode
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': args.to_dict(),
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    swanlab.init(**init_kwargs)
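`init_swanlab` now reads the project name from `SWANLAB_PROJ_NAME` (defaulting to `perf_benchmark`), picks up an optional `SWANLAB_WORKSPACE`, and switches SwanLab to local mode when `swanlab_api_key` is `'local'`. A hedged sketch of configuring this from a launcher script; the values are placeholders.

```python
import os

# Placeholder values; set these before calling run_perf_benchmark() so that
# init_swanlab() picks them up via os.getenv().
os.environ['SWANLAB_PROJ_NAME'] = 'my-perf-project'
os.environ['SWANLAB_WORKSPACE'] = 'my-team'

# Passing swanlab_api_key='local' in Arguments would log locally instead of
# to the hosted service, per the 'mode' entry in init_kwargs above.
```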