evalscope 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (35)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/config.py +1 -1
  19. evalscope/metrics/llm_judge.py +1 -1
  20. evalscope/models/chat_adapter.py +32 -11
  21. evalscope/perf/arguments.py +8 -6
  22. evalscope/perf/benchmark.py +31 -63
  23. evalscope/perf/plugin/api/openai_api.py +4 -2
  24. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  25. evalscope/perf/utils/db_util.py +2 -2
  26. evalscope/version.py +2 -2
  27. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
  28. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
  29. tests/cli/test_all.py +33 -24
  30. tests/cli/test_run.py +35 -18
  31. tests/rag/test_ragas.py +4 -1
  32. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  33. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  34. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  35. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py ADDED
@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
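
For orientation, the newly registered `mmlu_redux` benchmark is selected by the `name` given in `@Benchmark.register` above. A minimal sketch of running it, assuming evalscope's usual `TaskConfig`/`run_task` entry points; the model id, subset choice and limit are placeholders:

```python
# Sketch only: model id, subset and limit are hypothetical, not from this release.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # hypothetical model to evaluate
    datasets=['mmlu_redux'],             # name registered by the adapter above
    dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
    limit=5,                             # small smoke-test run
)
run_task(task)
```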
evalscope/benchmarks/simple_qa/simple_qa_adapter.py CHANGED
@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for SimpleQA')
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)
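
The inline aggregation deleted here presumably moves into the shared `DataAdapter.compute_dict_metric` helper (data_adapter.py gains 26 lines in this release). A rough sketch of the equivalent behaviour, reconstructed from the removed lines rather than from the library's actual implementation:

```python
from collections import defaultdict
from typing import Dict, List


def compute_dict_metric_sketch(review_res_list: List[dict]) -> Dict[str, list]:
    """Zip a list of per-sample dicts into a dict of per-key value lists."""
    res_dict = defaultdict(list)
    for res in review_res_list:
        for key, value in res.items():
            res_dict[key].append(value)
    return dict(res_dict)


# [{'is_correct': 1, 'is_incorrect': 0}, {'is_correct': 0, 'is_incorrect': 1}]
# -> {'is_correct': [1, 0], 'is_incorrect': [0, 1]}
```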
evalscope/config.py CHANGED
@@ -75,7 +75,7 @@ class TaskConfig:
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
-    judge_worker_num: int = 8
+    judge_worker_num: int = 1
    judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
 
    def __post_init__(self):
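
The default number of judge workers drops from 8 to 1; callers who relied on parallel judging can set the field explicitly. A minimal sketch, with a hypothetical model and dataset choice:

```python
from evalscope import TaskConfig

task = TaskConfig(
    model='my-model',        # hypothetical model id
    datasets=['simple_qa'],  # any judge-backed benchmark
    judge_worker_num=8,      # restore the previous parallelism if desired
)
```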
evalscope/metrics/llm_judge.py CHANGED
@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-3.5-turbo')
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
        self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
        self.generation_config = generation_config
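
The judge still resolves its endpoint and model from the environment variables read above, with the fallback model now `gpt-4`. A sketch of pointing it at a locally served judge; the endpoint and model name are placeholders:

```python
import os

# Variable names as read by the LLMJudge constructor above; values are placeholders.
os.environ['OPENAI_API_BASE'] = 'http://127.0.0.1:8000/v1'
os.environ['OPENAI_API_KEY'] = 'EMPTY'
os.environ['LOCAL_LLM'] = 'qwen2.5-72b-instruct'
```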
evalscope/models/chat_adapter.py CHANGED
@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
 
@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return generation_config
 
-    def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
 
+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)
 
-        return responses
+        return responses, input_lengths
 
     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
 
-        responses = self._model_generate(queries, system_prompts, infer_cfg)
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
 
+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-                ChatCompletionResponseChoice(
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-                for index, one_response in enumerate(response)
-            ]
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)
 
             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=None).model_dump(exclude_unset=True)
+                usage=usage).model_dump(exclude_unset=True)
 
             results.append(res_d)
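
The predict path now reports real `Usage` numbers by re-encoding the formatted prompt and each decoded response with the adapter's own tokenizer. A standalone sketch of that accounting; the function and parameter names are illustrative, and any HF-style tokenizer with an `encode` method is assumed:

```python
from typing import List


def count_usage(tokenizer, formatted_prompt: str, responses: List[str]) -> dict:
    """Mirror the accounting above: prompt tokens counted once, completion tokens summed."""
    prompt_tokens = len(tokenizer.encode(formatted_prompt))
    completion_tokens = sum(len(tokenizer.encode(r)) for r in responses)
    return {
        'prompt_tokens': prompt_tokens,
        'completion_tokens': completion_tokens,
        'total_tokens': prompt_tokens + completion_tokens,
    }
```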
 
evalscope/perf/arguments.py CHANGED
@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number: Optional[int] = None  # Number of requests to be made
+    number: int = 1000  # Number of requests to be made
     parallel: int = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
 
@@ -60,10 +60,11 @@
     seed: Optional[int] = 42  # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] = None  # Whether to stream the response
-    temperature: Optional[float] = None  # Temperature setting for the response
+    stream: Optional[bool] = False  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
     @staticmethod
     def from_args(args):
@@ -126,7 +127,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
@@ -161,10 +162,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, help='The random seed', default=42)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
+    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=False)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable
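The new `extra_args` field is merged verbatim into the request payload by the OpenAI plugin (`payload.update(param.extra_args)` further down, where `max_tokens` now also maps to `max_completion_tokens`). A hedged sketch of passing it through the Python entry point, assuming `run_perf_benchmark` still accepts a plain dict of `Arguments` fields; the endpoint, model name and extra key are placeholders:

```python
from evalscope.perf.main import run_perf_benchmark

run_perf_benchmark({
    'url': 'http://127.0.0.1:8000/v1/chat/completions',
    'model': 'my-served-model',
    'api': 'openai',
    'number': 100,                       # now defaults to 1000 when omitted
    'parallel': 4,
    'extra_args': {'ignore_eos': True},  # merged into every request body
})
```

On the command line the same payload extras go through the new `--extra-args` flag as a JSON string.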
 
evalscope/perf/benchmark.py CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import List
+from typing import AsyncGenerator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -21,92 +21,68 @@ from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
-query_send_completed_event = asyncio.Event()
+
 data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def dispatch_requests_worker(request_queue: asyncio.Queue, args: Arguments):
+async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     query_generator_class = ApiRegistry(args.api)
     query_generator = query_generator_class(args.tokenizer_path)
 
     def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
         if prompt_path_or_text.startswith('@'):
             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                 return file.read()
         return prompt_path_or_text
 
-    async def dispatch_request(request):
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
-
-    async def dispatch_requests_from_prompt(messages):
-        """Generate and dispatch requests based on the given prompt."""
+    async def generate_requests_from_prompt(messages):
         request = query_generator.build_request(messages, args)
-        if args.number is None:
-            await dispatch_request(request)
-            return 1
         for _ in range(args.number):
-            await dispatch_request(request)
-        return args.number
+            yield request
 
-    async def dispatch_requests_from_dataset():
-        """Generate and dispatch requests based on the dataset."""
-        total_query_count = 0
+    async def generate_requests_from_dataset():
         message_generator_class = DatasetRegistry(args.dataset)
         message_generator = message_generator_class(args)
 
+        count = 0
         for messages in message_generator:
             request = query_generator.build_request(messages, args)
-            if request is None:
-                continue
-            await dispatch_request(request)
-            total_query_count += 1
-            if args.number and total_query_count >= args.number:
-                break
-
-        return total_query_count
+            if request is not None:
+                yield request
+                count += 1
+                if args.number and count >= args.number:
+                    break
 
-    # Load prompt or dataset and dispatch requests accordingly
     if args.prompt:
         prompt = load_prompt(args.prompt)
         messages = [{'role': 'user', 'content': prompt}]
-        total_queries = await dispatch_requests_from_prompt(messages)
+        generator = generate_requests_from_prompt(messages)
     elif args.dataset:
-        total_queries = await dispatch_requests_from_dataset()
+        generator = generate_requests_from_dataset()
     else:
         raise Exception('Either prompt or dataset is required!')
 
-    return total_queries
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)
 
 
 @exception_handler
-async def send_requests_worker(
-    task_id,
-    request_queue: asyncio.Queue,
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
 ):
-    client = AioHttpClient(args)
-    async with client:
-        while not (query_send_completed_event.is_set() and request_queue.empty()):
-            try:
-                # Attempt to get a request from the queue with a timeout
-                request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-                request_queue.task_done()
-            except asyncio.TimeoutError:
-                # If timeout, continue to the next iteration
-                continue
-
-            # Initialize benchmark data for the current request
+    async with semaphore:
+        client = AioHttpClient(args)
+        async with client:
            benchmark_data = BenchmarkData(request=request)
            collected_messages = []
            try:
-                # Send the request and process the response
                async for is_error, state_code, response_data in client.post(request):
                    if is_error or state_code != HTTPStatus.OK:
                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,7 +100,6 @@ async def send_requests_worker(
                logger.exception(e)
                logger.error(f'Request query: {request} exception')
            finally:
-                # Record completion time and collected messages
                benchmark_data.completed_time = time.perf_counter()
                benchmark_data.response_messages = collected_messages
                await benchmark_data_queue.put(benchmark_data)
@@ -152,7 +127,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
 
     collected_benchmark_data = []
 
-    with tqdm(desc='Processing') as pbar:
+    with tqdm(desc='Processing', total=args.number) as pbar:
         while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
             try:
                 # Attempt to get benchmark data from the queue with a timeout
@@ -216,39 +191,32 @@ async def benchmark(args: Arguments) -> None:
     add_signal_handlers(loop)
 
     # init queue
-    request_queue = asyncio.Queue()
     benchmark_data_queue = asyncio.Queue()
 
     # reset event
-    query_send_completed_event.clear()
     data_process_completed_event.clear()
 
+    semaphore = asyncio.Semaphore(args.parallel)
+
     async def create_send_request_tasks():
         tasks: List[asyncio.Task] = []
-        for idx in range(args.parallel):
-            task = asyncio.create_task(send_requests_worker(idx, request_queue, benchmark_data_queue, args))
+        async for request in get_requests(args):
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
            tasks.append(task)
        return tasks
 
     async def run_tasks():
        await start_server(args)
 
-        dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
        statistic_benchmark_metric_task = asyncio.create_task(
            statistic_benchmark_metric_worker(benchmark_data_queue, args))
        send_request_tasks = await create_send_request_tasks()
 
-        expected_number_of_queries = await dispatch_task
-        await request_queue.join()
-        query_send_completed_event.set()
-
        await asyncio.gather(*send_request_tasks, return_exceptions=True)
        await benchmark_data_queue.join()
        data_process_completed_event.set()
 
        metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-        await asyncio.sleep(0.250)
+        summary_result(args, metrics, result_db_path)
 
     await run_tasks()
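
The refactor replaces the explicit request queue plus fixed worker pool with one task per request, bounded by `asyncio.Semaphore(args.parallel)`. A self-contained sketch of that concurrency pattern, with illustrative names rather than the benchmark's own API:

```python
import asyncio


async def handle(semaphore: asyncio.Semaphore, item: int) -> None:
    # At most `parallel` bodies run concurrently; the rest wait at the semaphore.
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for the HTTP request
        print(f'done {item}')


async def main(parallel: int = 4, total: int = 10) -> None:
    semaphore = asyncio.Semaphore(parallel)
    tasks = [asyncio.create_task(handle(semaphore, i)) for i in range(total)]
    await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == '__main__':
    asyncio.run(main())
```

This is also why `args.number` now has a concrete default: the progress bar (`total=args.number`) and the summary both use it as the expected request count.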
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -70,7 +70,7 @@ class OpenaiPlugin(ApiPluginBase):
     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if param.max_tokens is not None:
-            payload['max_tokens'] = param.max_tokens
+            payload['max_completion_tokens'] = param.max_tokens
         if param.min_tokens is not None:
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
@@ -94,9 +94,11 @@
             payload['top_p'] = param.top_p
         if param.top_k is not None:
             payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload
 
-    def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
evalscope/perf/plugin/datasets/speed_benchmark.py CHANGED
@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 
 @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
     def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)
 
+        url = self.query_parameters.url
+        if url.endswith('v1/chat/completions'):
+            logger.warning(
+                'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+            )
+            url = url.replace('v1/chat/completions', 'v1/completions')
+            self.query_parameters.url = url
+
     def build_messages(self) -> Iterator[List[Dict]]:
         for input_len in self.INPUT_LENGTH:
             for _ in range(self.REPEAT):
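
With this guard, pointing the speed benchmark at a chat endpoint is rewritten to the completions endpoint instead of failing. A hedged sketch of invoking it through the same assumed Python entry point as above; the URL and model name are placeholders:

```python
from evalscope.perf.main import run_perf_benchmark

run_perf_benchmark({
    'url': 'http://127.0.0.1:8000/v1/chat/completions',  # rewritten to /v1/completions by the plugin
    'model': 'my-served-model',
    'api': 'openai',
    'dataset': 'speed_benchmark',  # name registered by @register_dataset above
    'parallel': 1,
})
```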
evalscope/perf/utils/db_util.py CHANGED
@@ -194,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results
 
 
-def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
     data = metrics.create_message()
-    data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
+    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.13.1'
-__release_datetime__ = '2025-03-24 18:00:00'
+__version__ = '0.13.2'
+__release_datetime__ = '2025-04-01 20:00:00'