evalscope 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/config.py +1 -1
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +8 -6
- evalscope/perf/benchmark.py +31 -63
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +2 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +35 -18
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
ADDED

@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
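
Note: the following is a small standalone sketch (not part of the diff) of how the registered prompt_template and the adapter's option formatting above combine into one final query. The sample record and subset name are invented for illustration.

# Standalone sketch: reproduces the prompt assembly of MMLUReduxAdapter above.
# The sample record is invented; real records come from AI-ModelScope/mmlu-redux-2.0.
PROMPT_TEMPLATE = (
    'The following are multiple choice questions (with answers) about {subset_name}. '
    'Think step by step and then finish your answer with "the answer is (X)" '
    'where X is the correct letter choice.\n{query}')
CHOICES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']


def form_options(options):
    # Mirrors MMLUReduxAdapter.__form_options: label each option with a letter.
    option_str = 'Options are:\n'
    for opt, choice in zip(options, CHOICES):
        option_str += f'({choice}): {opt}\n'
    return option_str


sample = {'question': 'What is the order of the group Z_4 x Z_2?',
          'choices': ['2', '4', '8', '12'], 'answer': 2}  # invented item
query = 'Q: ' + sample['question'] + '\n' + form_options(sample['choices']) + '\n'
print(PROMPT_TEMPLATE.format(subset_name='abstract_algebra', query=query))
print('gold:', CHOICES[int(sample['answer'])])  # as in get_gold_answer -> 'C'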
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED

@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):

     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
        """
        # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
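
Note: the aggregation is now delegated to compute_dict_metric on the DataAdapter base class. Judging from the removed inline loop, it presumably groups per-sample result dicts into lists per metric key, roughly as in this standalone sketch (the helper name here is illustrative, not the library API).

from collections import defaultdict


def zip_dict_metrics(review_res_list):
    # What the removed inline loop did: turn a list of per-sample dicts such as
    # {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0} into a dict
    # of lists keyed by metric name, ready for compute_metric.
    res_dict = defaultdict(list)
    for res in review_res_list:
        for key, value in res.items():
            res_dict[key].append(value)
    return res_dict


print(zip_dict_metrics([
    {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
    {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
]))
# -> {'is_correct': [1, 0], 'is_incorrect': [0, 1], 'is_not_attempted': [0, 0]}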
evalscope/config.py
CHANGED
evalscope/metrics/llm_judge.py
CHANGED

@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
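
Note: per the constructor above, the judge model now falls back to the LOCAL_LLM environment variable and then to 'gpt-4'. A minimal sketch of pointing the judge at an OpenAI-compatible endpoint via the same environment variables (all values are placeholders):

import os

# Placeholders: any OpenAI-compatible endpoint and model name can be used.
os.environ['OPENAI_API_BASE'] = 'http://127.0.0.1:8000/v1'
os.environ['OPENAI_API_KEY'] = 'EMPTY'
os.environ['LOCAL_LLM'] = 'my-local-judge-model'  # otherwise the default is now 'gpt-4'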
evalscope/models/chat_adapter.py
CHANGED

@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union

 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning

@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return generation_config

-    def _model_generate(self,
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)

-        return responses
+        return responses, input_lengths

     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))

-        responses = self._model_generate(queries, system_prompts, infer_cfg)
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)

+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-                ChatCompletionResponseChoice(
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-                for index, one_response in enumerate(response)
-            ]
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)

             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=
+                usage=usage).model_dump(exclude_unset=True)

             results.append(res_d)

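
Note: the refactored predict() above now reports token usage by tokenizing the formatted prompt and each generated reply. A standalone sketch of the same accounting with a Hugging Face tokenizer (the checkpoint name is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint

formatted_prompt = 'Hello, how are you?'
generated = ['I am fine, thank you.', 'Doing well, thanks for asking.']

prompt_tokens = len(tokenizer.encode(formatted_prompt))
completion_tokens = sum(len(tokenizer.encode(r)) for r in generated)
usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}
print(usage)  # same fields as the Usage object populated above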
evalscope/perf/arguments.py
CHANGED

@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark

     # Performance and parallelism
-    number:
+    number: int = 1000  # Number of requests to be made
     parallel: int = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)

@@ -60,10 +60,11 @@ class Arguments:
     seed: Optional[int] = 42  # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] =
-    temperature:
+    stream: Optional[bool] = False  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments

     @staticmethod
     def from_args(args):
@@ -126,7 +127,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')

@@ -161,10 +162,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, help='The random seed', default=42)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=False)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable

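
Note: a short sketch of how the new --extra-args flag is parsed. argparse applies json.loads, so the value must be a JSON object; the parser setup mirrors the add_argument call above, and the flag value is an arbitrary example.

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--extra-args', type=json.loads, default='{}',
                    help='Extra arguments, should in JSON format')
args = parser.parse_args(['--extra-args', '{"repetition_penalty": 1.05, "enable_thinking": false}'])
print(args.extra_args)  # {'repetition_penalty': 1.05, 'enable_thinking': False}
# Downstream (see the OpenaiPlugin hunk below) this dict is merged into the request payload.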
evalscope/perf/benchmark.py
CHANGED

@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import List
+from typing import AsyncGenerator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -21,92 +21,68 @@ from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger

 logger = get_logger()
-query_send_completed_event = asyncio.Event()
+
 data_process_completed_event = asyncio.Event()


 @exception_handler
-async def
+async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     query_generator_class = ApiRegistry(args.api)
     query_generator = query_generator_class(args.tokenizer_path)

     def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
         if prompt_path_or_text.startswith('@'):
             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                 return file.read()
         return prompt_path_or_text

-    async def
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
-
-    async def dispatch_requests_from_prompt(messages):
-        """Generate and dispatch requests based on the given prompt."""
+    async def generate_requests_from_prompt(messages):
         request = query_generator.build_request(messages, args)
-        if args.number is None:
-            await dispatch_request(request)
-            return 1
         for _ in range(args.number):
-
-        return args.number
+            yield request

-    async def
-        """Generate and dispatch requests based on the dataset."""
-        total_query_count = 0
+    async def generate_requests_from_dataset():
         message_generator_class = DatasetRegistry(args.dataset)
         message_generator = message_generator_class(args)

+        count = 0
         for messages in message_generator:
             request = query_generator.build_request(messages, args)
-            if request is None:
-
-
-
-
-                break
-
-        return total_query_count
+            if request is not None:
+                yield request
+                count += 1
+                if args.number and count >= args.number:
+                    break

-    # Load prompt or dataset and dispatch requests accordingly
     if args.prompt:
         prompt = load_prompt(args.prompt)
         messages = [{'role': 'user', 'content': prompt}]
-
+        generator = generate_requests_from_prompt(messages)
     elif args.dataset:
-
+        generator = generate_requests_from_dataset()
     else:
         raise Exception('Either prompt or dataset is required!')

-
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)


 @exception_handler
-async def
-
-
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
 ):
-
-
-
-        try:
-            # Attempt to get a request from the queue with a timeout
-            request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-            request_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Initialize benchmark data for the current request
+    async with semaphore:
+        client = AioHttpClient(args)
+        async with client:
            benchmark_data = BenchmarkData(request=request)
            collected_messages = []
            try:
-                # Send the request and process the response
                async for is_error, state_code, response_data in client.post(request):
                    if is_error or state_code != HTTPStatus.OK:
                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,7 +100,6 @@ async def send_requests_worker(
                logger.exception(e)
                logger.error(f'Request query: {request} exception')
            finally:
-                # Record completion time and collected messages
                benchmark_data.completed_time = time.perf_counter()
                benchmark_data.response_messages = collected_messages
                await benchmark_data_queue.put(benchmark_data)
@@ -152,7 +127,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,

    collected_benchmark_data = []

-    with tqdm(desc='Processing') as pbar:
+    with tqdm(desc='Processing', total=args.number) as pbar:
        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
            try:
                # Attempt to get benchmark data from the queue with a timeout
@@ -216,39 +191,32 @@ async def benchmark(args: Arguments) -> None:
    add_signal_handlers(loop)

    # init queue
-    request_queue = asyncio.Queue()
    benchmark_data_queue = asyncio.Queue()

    # reset event
-    query_send_completed_event.clear()
    data_process_completed_event.clear()

+    semaphore = asyncio.Semaphore(args.parallel)
+
    async def create_send_request_tasks():
        tasks: List[asyncio.Task] = []
-        for
-            task = asyncio.create_task(
+        async for request in get_requests(args):
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
            tasks.append(task)
        return tasks

    async def run_tasks():
        await start_server(args)

-        dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
        statistic_benchmark_metric_task = asyncio.create_task(
            statistic_benchmark_metric_worker(benchmark_data_queue, args))
        send_request_tasks = await create_send_request_tasks()

-        expected_number_of_queries = await dispatch_task
-        await request_queue.join()
-        query_send_completed_event.set()
-
        await asyncio.gather(*send_request_tasks, return_exceptions=True)
        await benchmark_data_queue.join()
        data_process_completed_event.set()

        metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics,
-
-        await asyncio.sleep(0.250)
+        summary_result(args, metrics, result_db_path)

    await run_tasks()
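
Note: the refactor replaces the old request-queue/worker model with an async generator that yields requests plus one task per request, bounded by an asyncio.Semaphore sized to --parallel. A self-contained sketch of that pattern (not evalscope code; names and timings are illustrative):

import asyncio


async def get_requests(n):
    # Stand-in for the request generator: yields one request dict per query.
    for i in range(n):
        yield {'id': i}


async def send_request(semaphore, request, results):
    async with semaphore:          # at most `parallel` requests in flight
        await asyncio.sleep(0.01)  # stand-in for the HTTP call
        await results.put(request['id'])


async def main(parallel=4, number=10):
    semaphore = asyncio.Semaphore(parallel)
    results = asyncio.Queue()
    tasks = [
        asyncio.create_task(send_request(semaphore, request, results))
        async for request in get_requests(number)
    ]
    await asyncio.gather(*tasks)
    print([results.get_nowait() for _ in range(results.qsize())])


asyncio.run(main())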
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if param.max_tokens is not None:
-            payload['
+            payload['max_completion_tokens'] = param.max_tokens
         if param.min_tokens is not None:
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
             payload['top_p'] = param.top_p
         if param.top_k is not None:
             payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload

-    def parse_responses(self, responses, request: Any = None, **kwargs) ->
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
        """Parser responses and return number of request and response tokens.
        Only one response for non-stream, multiple responses for stream.
        """
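
Note: a quick sketch of the resulting payload assembly, assuming the values below are arbitrary examples: the token limit is now sent under 'max_completion_tokens', and any --extra-args dict is merged in last.

payload = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hello'}]}
max_tokens = 256
extra_args = {'repetition_penalty': 1.05}  # e.g. parsed from --extra-args

payload['max_completion_tokens'] = max_tokens  # key used after this change
payload.update(extra_args)                     # new: user-supplied extras override/extend the payload
print(payload)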
evalscope/perf/plugin/datasets/speed_benchmark.py
CHANGED

@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()


 @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
     def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)

+        url = self.query_parameters.url
+        if url.endswith('v1/chat/completions'):
+            logger.warning(
+                'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+            )
+            url = url.replace('v1/chat/completions', 'v1/completions')
+            self.query_parameters.url = url
+
     def build_messages(self) -> Iterator[List[Dict]]:
         for input_len in self.INPUT_LENGTH:
             for _ in range(self.REPEAT):
evalscope/perf/utils/db_util.py
CHANGED

@@ -194,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results


-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

     data = metrics.create_message()
-    data.update({'Expected number of requests':
+    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))

     # Print summary in a table
evalscope/version.py
CHANGED