evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic (see the registry page for details).
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +6 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +1 -1
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +1 -1
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +9 -8
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/__init__.py
File without changes

evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
ADDED

@@ -0,0 +1,79 @@
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.utils import ResponseParser
+
+SUBSET_LIST = ['default']
+
+
+@Benchmark.register(
+    name='maritime_bench',
+    pretty_name='MaritimeBench',
+    dataset_id='HiDolphin/MaritimeBench',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    eval_split='test',
+    prompt_template=
+    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}', # noqa: E501
+)
+class MaritimeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        prefix = ''
+        query = prefix + input_d['question'] + '\n'
+        available_choices = []
+        for option in self.choices:
+            if option in input_d and input_d[option]:
+                query += option + ':' + input_d[option] + '\n'
+                available_choices.append(option)
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the raw model prediction (pred).
+
+        Args:
+            pred: model prediction. Depending on the model.
+
+        Returns:
+            The parsed prediction. e.g. model answer... Depending on the model.
+        """
+
+        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer with the predicted answer.
+
+        Args:
+            gold: The gold answer.
+            pred: The predicted answer.
+
+        Returns:
+            The result of the match.
+        """
+        return exact_match(gold=gold, pred=pred)
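The adapter above registers the benchmark under the name `maritime_bench`, so it can be selected like any other dataset. A minimal sketch of running it against an OpenAI-compatible endpoint, assuming the usual `TaskConfig`/`run_task` entry points; the model name, URL and key below are placeholders:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',                          # placeholder model name
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',                                      # placeholder key
    eval_type='service',
    datasets=['maritime_bench'],                          # name registered by @Benchmark.register above
    limit=10,                                             # small smoke-test subset
)
run_task(task_cfg)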
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED

@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
    train_split='train',
    eval_split='test',
    prompt_template=
-
+    """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""", # noqa: E501
)
class MMLUAdapter(DataAdapter):

@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):

        context: str = '\n'.join(few_shot_prompts) + '\n'
        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

        return self.gen_prompt_data(full_prompt)

@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

@@ -260,11 +259,10 @@ class MMLUAdapter(DataAdapter):

        example: str = input_d['input']
        for j in range(len(self.choices)):
-            example += '\n{
+            example += f'\n{self.choices[j]}) {input_choices[j]}'

-        example += '\nAnswer:'
        if include_answer:
-            example +=
+            example += f"\nAnswer: {input_d['target']}\n\n"

        return example
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED

@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        """

evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
CHANGED

@@ -164,7 +164,7 @@ class MMLUReduxAdapter(DataAdapter):
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        """

evalscope/benchmarks/musr/musr_adapter.py
CHANGED

@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        """
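The same one-line change recurs across the MMLU, MMLU-Pro, MMLU-Redux and MuSR adapters: the free-form answer parser is now told which option letters are valid for the current question. A rough illustration of the idea as a standalone regex parser (this is not evalscope's `ResponseParser` implementation):

import re
from typing import List, Optional

def parse_first_option(text: str, options: List[str]) -> Optional[str]:
    # Return the first standalone option letter found in the model output,
    # restricted to the letters that are actually offered.
    pattern = r'\b(' + '|'.join(re.escape(o) for o in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else None

print(parse_first_option('The answer is C because ...', options=['A', 'B', 'C', 'D']))  # C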
evalscope/collections/evaluator.py
CHANGED

@@ -65,7 +65,7 @@ class EvaluatorCollection:
        self.evaluators = self._initialize_evaluators()

    def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_name = os.path.basename(self.data_adapter.dataset_id)
+        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
        raw_dataset = self.data_adapter.load()
        # limit the dataset
        if self.task_cfg.limit:

@@ -174,6 +174,7 @@ class EvaluatorCollection:
        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
        with open(report_file_path, 'w', encoding='utf-8') as f:
            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        return report

    def _filter_answer(self, pred_file_path):
        answer_dict = defaultdict(dict)

@@ -274,4 +275,5 @@ class EvaluatorCollection:
        answers = self.get_answers()
        reviews = self.get_reviews(answers)
        scores = self.get_scores(reviews)
-        self.get_report(scores)
+        report = self.get_report(scores)
+        return report
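With `get_report` returning the report object and the final pipeline step returning it as well, a collection run can be consumed programmatically instead of only reading the JSON written to `report_file_path`. A rough sketch, assuming `collection` is an already-configured `EvaluatorCollection` and that the entry point shown in the last hunk is its `eval` method (the method name is an assumption):

# Hypothetical usage: `collection` is an EvaluatorCollection built elsewhere from a TaskConfig.
report = collection.eval()    # previously returned None; now returns the report object
as_dict = report.to_dict()    # same structure that json.dump writes to report_file_path
print(list(as_dict.keys()))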
evalscope/config.py
CHANGED

@@ -212,7 +212,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
        logger.info('Args: Task config is provided with CommandLine type.')
        task_cfg = TaskConfig.from_args(task_cfg)
    elif isinstance(task_cfg, str):
-        extension =
+        extension = os.path.splitext(task_cfg)[-1]
        logger.info(f'Args: Task config is provided with {extension} file type.')
        if extension in ['yaml', 'yml']:
            task_cfg = TaskConfig.from_yaml(task_cfg)
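The parser now derives the extension with `os.path.splitext`. For reference, the standard library keeps the leading dot in the second element of the returned tuple:

import os

# Illustrative only; the path is a placeholder.
print(os.path.splitext('configs/eval_task.yaml'))      # ('configs/eval_task', '.yaml')
print(os.path.splitext('configs/eval_task.yaml')[-1])  # '.yaml'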
evalscope/perf/arguments.py
CHANGED

@@ -35,6 +35,7 @@ class Arguments:
    log_every_n_query: int = 10  # Log every N queries
    debug: bool = False  # Debug mode
    wandb_api_key: Optional[str] = None  # WandB API key for logging
+    swanlab_api_key: Optional[str] = None  # SwanLab API key for logging
    name: Optional[str] = None  # Name for the run

    # Output settings

@@ -46,6 +47,7 @@ class Arguments:
    prefix_length: int = 0  # Length of the prefix, only for random dataset
    prompt: Optional[str] = None  # The prompt text
    query_template: Optional[str] = None  # Template for the query
+    apply_chat_template: Optional[bool] = None  # Whether to apply chat template

    # Dataset settings
    dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')

@@ -57,10 +59,10 @@ class Arguments:
    max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
    min_tokens: Optional[int] = None  # Minimum number of tokens in the response
    n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = 0  # Random seed for reproducibility
    stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
    stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] =
+    stream: Optional[bool] = True  # Whether to stream the response
    temperature: float = 0.0  # Temperature setting for the response
    top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
    top_k: Optional[int] = None  # Top-k sampling setting for the response

@@ -76,12 +78,26 @@ class Arguments:
        return Arguments(**args_dict)

    def __post_init__(self):
+        # Set the default headers
        self.headers = self.headers or {}  # Default to empty dictionary
        if self.api_key:
            # Assuming the API key is used as a Bearer token
            self.headers['Authorization'] = f'Bearer {self.api_key}'
+
+        # Set the model ID based on the model name
        self.model_id = os.path.basename(self.model)

+        # Set the URL based on the dataset type
+        if self.api.startswith('local'):
+            if self.dataset.startswith('speed_benchmark'):
+                self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+            else:
+                self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+        # Set the apply_chat_template flag based on the URL
+        if self.apply_chat_template is None:
+            self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
    def __str__(self):
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@@ -135,7 +151,8 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
    parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-    parser.add_argument('--
+    parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+    parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')

    # Prompt settings
    parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')

@@ -143,6 +160,8 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
    parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
    parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+    parser.add_argument(
+        '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt') # noqa: E501

    # Output settings
    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')

@@ -159,10 +178,10 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument(
        '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
    parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=0)
    parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
    parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action=
+    parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
    parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
    parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
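Taken together, the new defaults stream responses, fix the seed to 0, and infer `apply_chat_template` from the endpoint URL in `__post_init__`. A minimal sketch, assuming `Arguments` can be constructed with only the fields shown here (the remaining dataclass fields have defaults) and using placeholder model/URL values:

from evalscope.perf.arguments import Arguments

args = Arguments(model='qwen2.5-7b-instruct', url='http://127.0.0.1:8000/v1/chat/completions')
print(args.stream)               # True (new default)
print(args.seed)                 # 0 (new default)
print(args.apply_chat_template)  # True, inferred because the URL ends with 'chat/completions'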
evalscope/perf/benchmark.py
CHANGED

@@ -18,6 +18,7 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
from evalscope.perf.utils.local_server import start_app
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
from evalscope.utils.logger import get_logger

logger = get_logger()

@@ -56,7 +57,7 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:

    if args.prompt:
        prompt = load_prompt(args.prompt)
-        messages = [{'role': 'user', 'content': prompt}]
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
        generator = generate_requests_from_prompt(messages)
    elif args.dataset:
        generator = generate_requests_from_dataset()

@@ -81,6 +82,7 @@ async def send_request(
    client = AioHttpClient(args)
    async with client:
        benchmark_data = BenchmarkData(request=request)
+        benchmark_data.start_time = time.perf_counter()
        collected_messages = []
        try:
            async for is_error, state_code, response_data in client.post(request):

@@ -106,24 +108,18 @@


@exception_handler
-async def
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
    metrics = BenchmarkMetrics(concurrency=args.parallel)

    api_plugin_class = ApiRegistry(args.api)
    api_plugin = api_plugin_class(args.tokenizer_path)

    result_db_path = get_result_db_path(args)
-    # Initialize wandb
-    if args.wandb_api_key:
-        import datetime
-        import wandb
-        os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] = args.outputs_dir

-
-
-
-
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)

    collected_benchmark_data = []

@@ -146,9 +142,13 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
            # Create a message with the updated metrics
            message = metrics.create_message()

-            # Log the message to wandb if the api key is provided
+            # Log the message to wandb\swanlab if the api key is provided
            if args.wandb_api_key:
+                import wandb
                wandb.log(message)
+            if args.swanlab_api_key:
+                import swanlab
+                swanlab.log(message)

            # Log the message to the logger every n queries
            if int(metrics.n_total_queries) % args.log_every_n_query == 0:

@@ -169,17 +169,12 @@


@exception_handler
-async def
+async def connect_test(args: Arguments) -> bool:
    if args.api.startswith('local'):
        # start local server
        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
        server.start()

-        if args.dataset.startswith('speed_benchmark'):
-            args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-        else:
-            args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
    if (not args.no_test_connection) and (not await test_connection(args)):
        raise TimeoutError('Test connection failed')

@@ -192,31 +187,22 @@ async def benchmark(args: Arguments) -> None:

    # init queue
    benchmark_data_queue = asyncio.Queue()
-
    # reset event
    data_process_completed_event.clear()
-
+    # test connection
+    await connect_test(args)
+    # start statistic benchmark metric
+    statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+    # start send request
    semaphore = asyncio.Semaphore(args.parallel)
+    send_request_tasks: List[asyncio.Task] = []
+    async for request in get_requests(args):
+        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+        send_request_tasks.append(task)

-
-
-
-        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
-        tasks.append(task)
-    return tasks
-
-    async def run_tasks():
-        await start_server(args)
-
-        statistic_benchmark_metric_task = asyncio.create_task(
-            statistic_benchmark_metric_worker(benchmark_data_queue, args))
-        send_request_tasks = await create_send_request_tasks()
-
-        await asyncio.gather(*send_request_tasks, return_exceptions=True)
-        await benchmark_data_queue.join()
-        data_process_completed_event.set()
-
-        metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics, result_db_path)
+    await asyncio.gather(*send_request_tasks, return_exceptions=True)
+    await benchmark_data_queue.join()
+    data_process_completed_event.set()

-    await
+    metrics, result_db_path = await statistic_benchmark_metric_task
+    summary_result(args, metrics, result_db_path)
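The restructured `benchmark` coroutine is a producer/consumer pipeline: request tasks push results onto an `asyncio.Queue`, a single statistics task drains it, and a completion event tells the consumer when the producers are finished. A generic, standalone sketch of the same pattern (not evalscope's code):

import asyncio

done_event = asyncio.Event()

async def producer(queue: asyncio.Queue, i: int) -> None:
    await asyncio.sleep(0.01 * i)          # stand-in for sending an HTTP request
    await queue.put(f'result-{i}')

async def consumer(queue: asyncio.Queue) -> int:
    processed = 0
    while not (done_event.is_set() and queue.empty()):
        try:
            item = await asyncio.wait_for(queue.get(), timeout=0.1)
        except asyncio.TimeoutError:
            continue
        processed += 1                      # stand-in for updating BenchmarkMetrics
        queue.task_done()
    return processed

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    stats_task = asyncio.create_task(consumer(queue))
    senders = [asyncio.create_task(producer(queue, i)) for i in range(10)]
    await asyncio.gather(*senders, return_exceptions=True)
    await queue.join()                      # every queued item has been task_done()
    done_event.set()
    print(await stats_task)                 # 10

asyncio.run(main())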
evalscope/perf/http_client.py
CHANGED

@@ -24,7 +24,6 @@ class AioHttpClient:
        self.connect_timeout = args.connect_timeout
        self.client = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
-            connector=aiohttp.TCPConnector(limit=1),
            trace_configs=[self._create_trace_config()] if args.debug else [])

    def _create_trace_config(self):

@@ -144,7 +143,7 @@ async def test_connection(args: Arguments) -> bool:
    async def attempt_connection():
        client = AioHttpClient(args)
        async with client:
-            if
+            if args.apply_chat_template:
                request = {
                    'messages': [{
                        'role': 'user',

@@ -164,7 +163,7 @@ async def test_connection(args: Arguments) -> bool:
            is_error, state_code, response_data = await asyncio.wait_for(
                attempt_connection(), timeout=args.connect_timeout)
            if not is_error:
-                logger.info('
+                logger.info('Test connection successful.')
                return True
            logger.warning(f'Retrying... <{state_code}> {response_data}')
        except Exception as e:
evalscope/perf/plugin/api/custom_api.py
CHANGED

@@ -24,7 +24,7 @@ class CustomPlugin(ApiPluginBase):
        """
        super().__init__(model_path=mode_path)
        if mode_path is not None:
-            from
+            from modelscope import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
        else:
            self.tokenizer = None

evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -24,7 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
        """
        super().__init__(model_path=mode_path)
        if mode_path is not None:
-            from
+            from modelscope import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
        else:
            self.tokenizer = None

@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
    def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
        payload['model'] = param.model
        if param.max_tokens is not None:
-            payload['
+            payload['max_tokens'] = param.max_tokens
        if param.min_tokens is not None:
            payload['min_tokens'] = param.min_tokens
        if param.frequency_penalty is not None:
evalscope/perf/plugin/datasets/custom.py
CHANGED

@@ -18,4 +18,7 @@ class CustomDatasetPlugin(DatasetPluginBase):
            prompt = item.strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt

evalscope/perf/plugin/datasets/line_by_line.py
CHANGED

@@ -19,4 +19,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
            prompt = item.strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt

evalscope/perf/plugin/datasets/longalpaca.py
CHANGED

@@ -24,4 +24,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
            prompt = item['instruction'].strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
                    prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt

evalscope/perf/plugin/datasets/openqa.py
CHANGED

@@ -29,4 +29,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
            prompt = item['question'].strip()
            if (len(prompt) > self.query_parameters.min_prompt_length
                    and len(prompt) < self.query_parameters.max_prompt_length):
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt
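Every built-in dataset plugin now branches on `apply_chat_template`: when it is set, the plugin yields OpenAI-style chat messages; otherwise it yields the raw prompt string for a plain completions endpoint. A minimal standalone sketch of that contract (not evalscope's plugin base class):

from typing import Dict, Iterator, List, Union

def build_messages(prompts: List[str], apply_chat_template: bool) -> Iterator[Union[str, List[Dict]]]:
    # Yield chat messages or raw prompts, mirroring the plugins above.
    for prompt in prompts:
        prompt = prompt.strip()
        if apply_chat_template:
            yield [{'role': 'user', 'content': prompt}]
        else:
            yield prompt

print(list(build_messages(['hello'], apply_chat_template=True)))   # [[{'role': 'user', 'content': 'hello'}]]
print(list(build_messages(['hello'], apply_chat_template=False)))  # ['hello']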
evalscope/perf/plugin/datasets/random_dataset.py
CHANGED

@@ -23,8 +23,12 @@ class RandomDatasetPlugin(DatasetPluginBase):
        self.number = self.query_parameters.number or 1

    def build_messages(self) -> Iterator[List[Dict]]:
-
-
+        if self.query_parameters.apply_chat_template:
+            min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+            max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+        else:
+            min_prompt_length = self.query_parameters.min_prompt_length
+            max_prompt_length = self.query_parameters.max_prompt_length + 1

        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.' # noqa: E501
        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.' # noqa: E501

@@ -34,10 +38,13 @@ class RandomDatasetPlugin(DatasetPluginBase):
        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)

        for i in range(self.number):
-            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
-            prompt = self.tokenizer.decode(
-
-
+            prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
+            prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+
+            if self.query_parameters.apply_chat_template:
+                yield [{'role': 'user', 'content': prompt}]
+            else:
+                yield prompt

    def get_random_inputs(self, length: int) -> List[int]:
        if length <= 0:
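The random dataset now decodes the configured prefix IDs together with the randomly drawn token IDs, and yields chat messages or a plain string depending on `apply_chat_template`. A rough standalone sketch of the ID-generation step; the model name is a placeholder, and any tokenizer exposing `vocab_size` and `decode` behaves the same way:

import numpy as np
from modelscope import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder model

rng = np.random.default_rng(0)
input_len = 32
offset = int(rng.integers(0, tokenizer.vocab_size))

# Wrap a sliding window of token IDs around the vocabulary, as the plugin does.
prompt_ids = ((offset + np.arange(input_len)) % tokenizer.vocab_size).tolist()
prompt = tokenizer.decode(prompt_ids)
print(prompt[:80])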
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -11,7 +11,7 @@ logger = get_logger()
@dataclass
class BenchmarkData:
    request: Any = None
-    start_time: float =
+    start_time: float = 0.0
    completed_time: float = 0.0
    chunk_times: List[float] = field(default_factory=list)
    success: bool = False

@@ -73,7 +73,9 @@ class BenchmarkMetrics:
    avg_chunk_time: float = -1
    avg_prompt_tokens: float = -1
    avg_completion_tokens: float = -1
-
+    avg_input_token_per_seconds: float = -1
+    avg_output_token_per_seconds: float = -1
+    avg_total_token_per_seconds: float = -1
    avg_time_per_token: float = -1
    qps: float = -1

@@ -111,22 +113,26 @@ class BenchmarkMetrics:
            self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
            self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
            self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
-            self.
+            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+            self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+            self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+                                                + self.n_total_completion_tokens) / self.total_time
            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
            self.qps = self.n_succeed_queries / self.total_time
        except ZeroDivisionError as e:
            logger.exception(e)
            return

-    def create_message(self, default_ndigits=
+    def create_message(self, default_ndigits=4):
        message = {
            'Time taken for tests (s)': round(self.total_time, default_ndigits),
            'Number of concurrency': self.concurrency,
            'Total requests': int(self.n_total_queries),
            'Succeed requests': self.n_succeed_queries,
            'Failed requests': self.n_failed_queries,
-            '
-            '
+            'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+            'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+            'Request throughput (req/s)': round(self.qps, default_ndigits),
            'Average latency (s)': round(self.avg_latency, default_ndigits),
            'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
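The three new throughput fields are plain ratios over the whole test window. A small worked example with made-up numbers:

# Hypothetical run: 120 successful requests, 24_000 prompt tokens, 60_000 completion
# tokens, 300 s total wall time, and 150 s of summed time-to-first-chunk latency.
n_total_prompt_tokens = 24_000
n_total_completion_tokens = 60_000
total_time = 300.0
total_first_chunk_latency = 150.0
n_succeed_queries = 120

avg_input_token_per_seconds = n_total_prompt_tokens / total_first_chunk_latency                 # 160.0
avg_output_token_per_seconds = n_total_completion_tokens / total_time                           # 200.0
avg_total_token_per_seconds = (n_total_prompt_tokens + n_total_completion_tokens) / total_time  # 280.0
qps = n_succeed_queries / total_time                                                            # 0.4 req/s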
evalscope/perf/utils/db_util.py
CHANGED

@@ -175,7 +175,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:

    metrics = {
        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-        '
+        'ITL (s)':
        inter_token_latencies_all,
        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
evalscope/perf/utils/log_utils.py
ADDED

@@ -0,0 +1,41 @@
+import os
+
+from evalscope.perf.arguments import Arguments
+
+
+def init_wandb(args: Arguments) -> None:
+    """
+    Initialize WandB for logging.
+    """
+    # Initialize wandb if the api key is provided
+    import datetime
+    try:
+        import wandb
+    except ImportError:
+        raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+    os.environ['WANDB_SILENT'] = 'true'
+    os.environ['WANDB_DIR'] = args.outputs_dir
+
+    wandb.login(key=args.wandb_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+def init_swanlab(args: Arguments) -> None:
+    import datetime
+    try:
+        import swanlab
+    except ImportError:
+        raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+    os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+    if not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    swanlab.config.update({'framework': '📏evalscope'})
+    swanlab.init(
+        project='perf_benchmark',
+        name=name,
+        config=args.to_dict(),
+        mode='local' if args.swanlab_api_key == 'local' else None)