evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Potentially problematic release: this version of evalscope was flagged as possibly problematic by the registry scanner.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py
CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import List
+from typing import AsyncGenerator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -18,95 +18,73 @@ from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
 from evalscope.perf.utils.local_server import start_app
+from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
-
+
 data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def
+async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     query_generator_class = ApiRegistry(args.api)
     query_generator = query_generator_class(args.tokenizer_path)
 
     def load_prompt(prompt_path_or_text):
-        """Load the prompt from a file or directly from the input text."""
         if prompt_path_or_text.startswith('@'):
             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
                 return file.read()
        return prompt_path_or_text
 
-    async def
-        """Dispatch a single request with optional rate limiting."""
-        await request_queue.put(request)
-        if args.rate != -1:
-            interval = np.random.exponential(1.0 / args.rate)
-            await asyncio.sleep(interval)
-
-    async def dispatch_requests_from_prompt(messages):
-        """Generate and dispatch requests based on the given prompt."""
+    async def generate_requests_from_prompt(messages):
        request = query_generator.build_request(messages, args)
-        if args.number is None:
-            await dispatch_request(request)
-            return 1
        for _ in range(args.number):
-
-        return args.number
+            yield request
 
-    async def
-        """Generate and dispatch requests based on the dataset."""
-        total_query_count = 0
+    async def generate_requests_from_dataset():
        message_generator_class = DatasetRegistry(args.dataset)
        message_generator = message_generator_class(args)
 
+        count = 0
        for messages in message_generator:
            request = query_generator.build_request(messages, args)
-            if request is None:
-
-
-
-
-                break
-
-        return total_query_count
+            if request is not None:
+                yield request
+                count += 1
+            if args.number and count >= args.number:
+                break
 
-    # Load prompt or dataset and dispatch requests accordingly
    if args.prompt:
        prompt = load_prompt(args.prompt)
-        messages = [{'role': 'user', 'content': prompt}]
-
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+        generator = generate_requests_from_prompt(messages)
    elif args.dataset:
-
+        generator = generate_requests_from_dataset()
    else:
        raise Exception('Either prompt or dataset is required!')
 
-
+    async for request in generator:
+        yield request
+        if args.rate != -1:
+            interval = np.random.exponential(1.0 / args.rate)
+            await asyncio.sleep(interval)
 
 
 @exception_handler
-async def
-
-
+async def send_request(
+    semaphore: asyncio.Semaphore,
+    request: dict,
    benchmark_data_queue: asyncio.Queue,
    args: Arguments,
 ):
-
-
-
-        try:
-            # Attempt to get a request from the queue with a timeout
-            request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-            request_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Initialize benchmark data for the current request
+    async with semaphore:
+        client = AioHttpClient(args)
+        async with client:
            benchmark_data = BenchmarkData(request=request)
+            benchmark_data.start_time = time.perf_counter()
            collected_messages = []
            try:
-                # Send the request and process the response
                async for is_error, state_code, response_data in client.post(request):
                    if is_error or state_code != HTTPStatus.OK:
                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
@@ -124,35 +102,28 @@ async def send_requests_worker(
                logger.exception(e)
                logger.error(f'Request query: {request} exception')
            finally:
-                # Record completion time and collected messages
                benchmark_data.completed_time = time.perf_counter()
                benchmark_data.response_messages = collected_messages
                await benchmark_data_queue.put(benchmark_data)
 
 
 @exception_handler
-async def
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
    metrics = BenchmarkMetrics(concurrency=args.parallel)
 
    api_plugin_class = ApiRegistry(args.api)
    api_plugin = api_plugin_class(args.tokenizer_path)
 
    result_db_path = get_result_db_path(args)
-    # Initialize wandb
-    if args.wandb_api_key:
-        import datetime
-        import wandb
-        os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] = args.outputs_dir
 
-
-
-
-
+    if args.wandb_api_key:
+        init_wandb(args)
+    if args.swanlab_api_key:
+        init_swanlab(args)
 
    collected_benchmark_data = []
 
-    with tqdm(desc='Processing') as pbar:
+    with tqdm(desc='Processing', total=args.number) as pbar:
        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
            try:
                # Attempt to get benchmark data from the queue with a timeout
@@ -171,9 +142,13 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
                # Create a message with the updated metrics
                message = metrics.create_message()
 
-                # Log the message to wandb if the api key is provided
+                # Log the message to wandb\swanlab if the api key is provided
                if args.wandb_api_key:
+                    import wandb
                    wandb.log(message)
+                if args.swanlab_api_key:
+                    import swanlab
+                    swanlab.log(message)
 
                # Log the message to the logger every n queries
                if int(metrics.n_total_queries) % args.log_every_n_query == 0:
@@ -194,17 +169,12 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
 
 
 @exception_handler
-async def
+async def connect_test(args: Arguments) -> bool:
    if args.api.startswith('local'):
        # start local server
        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
        server.start()
 
-        if args.dataset.startswith('speed_benchmark'):
-            args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-        else:
-            args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
    if (not args.no_test_connection) and (not await test_connection(args)):
        raise TimeoutError('Test connection failed')
 
@@ -216,39 +186,23 @@ async def benchmark(args: Arguments) -> None:
    add_signal_handlers(loop)
 
    # init queue
-    request_queue = asyncio.Queue()
    benchmark_data_queue = asyncio.Queue()
-
    # reset event
-    query_send_completed_event.clear()
    data_process_completed_event.clear()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        await request_queue.join()
-        query_send_completed_event.set()
-
-        await asyncio.gather(*send_request_tasks, return_exceptions=True)
-        await benchmark_data_queue.join()
-        data_process_completed_event.set()
-
-        metrics, result_db_path = await statistic_benchmark_metric_task
-        summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-        await asyncio.sleep(0.250)
-
-    await run_tasks()
+    # test connection
+    await connect_test(args)
+    # start statistic benchmark metric
+    statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+    # start send request
+    semaphore = asyncio.Semaphore(args.parallel)
+    send_request_tasks: List[asyncio.Task] = []
+    async for request in get_requests(args):
+        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+        send_request_tasks.append(task)
+
+    await asyncio.gather(*send_request_tasks, return_exceptions=True)
+    await benchmark_data_queue.join()
+    data_process_completed_event.set()
+
+    metrics, result_db_path = await statistic_benchmark_metric_task
+    summary_result(args, metrics, result_db_path)
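Note: the new benchmark flow drops the old request queue and worker pool in favour of an async request generator plus one task per request, gated by an asyncio.Semaphore. A minimal, self-contained sketch of that pattern follows; the names, the stand-in "send" step, and the defaults are illustrative, not evalscope's actual API.

```python
import asyncio
from typing import AsyncGenerator

import numpy as np


async def get_requests(n: int, rate: float = -1) -> AsyncGenerator[dict, None]:
    """Yield request payloads, optionally spacing them with Poisson-like arrivals."""
    for i in range(n):
        yield {'prompt': f'query-{i}'}
        if rate != -1:
            await asyncio.sleep(np.random.exponential(1.0 / rate))


async def send_request(semaphore: asyncio.Semaphore, request: dict, results: asyncio.Queue):
    async with semaphore:              # at most `parallel` requests in flight
        await asyncio.sleep(0.01)      # stand-in for AioHttpClient.post(request)
        await results.put(request)


async def main(parallel: int = 4, number: int = 10):
    results: asyncio.Queue = asyncio.Queue()
    semaphore = asyncio.Semaphore(parallel)
    tasks = [asyncio.create_task(send_request(semaphore, r, results))
             async for r in get_requests(number)]
    await asyncio.gather(*tasks)
    print(f'collected {results.qsize()} responses')


asyncio.run(main())
```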
evalscope/perf/http_client.py
CHANGED
@@ -24,7 +24,6 @@ class AioHttpClient:
         self.connect_timeout = args.connect_timeout
         self.client = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
-            connector=aiohttp.TCPConnector(limit=1),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
     def _create_trace_config(self):
@@ -144,7 +143,7 @@ async def test_connection(args: Arguments) -> bool:
     async def attempt_connection():
         client = AioHttpClient(args)
         async with client:
-            if
+            if args.apply_chat_template:
                 request = {
                     'messages': [{
                         'role': 'user',
@@ -164,7 +163,7 @@ async def test_connection(args: Arguments) -> bool:
             is_error, state_code, response_data = await asyncio.wait_for(
                 attempt_connection(), timeout=args.connect_timeout)
             if not is_error:
-                logger.info('
+                logger.info('Test connection successful.')
                 return True
             logger.warning(f'Retrying... <{state_code}> {response_data}')
         except Exception as e:
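Note: the connection test now picks between a chat-style body and a raw prompt based on apply_chat_template. A rough standalone sketch of that branching; the fields beyond messages/prompt/model are assumptions for illustration only.

```python
def build_test_request(model: str, apply_chat_template: bool) -> dict:
    # Chat endpoints expect a `messages` list; completion endpoints take a bare `prompt`.
    if apply_chat_template:
        return {'model': model, 'messages': [{'role': 'user', 'content': 'hello'}], 'max_tokens': 10}
    return {'model': model, 'prompt': 'hello', 'max_tokens': 10}


print(build_test_request('qwen2.5', apply_chat_template=True))
```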
evalscope/perf/plugin/api/custom_api.py
CHANGED
@@ -24,7 +24,7 @@ class CustomPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
-            from
+            from modelscope import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -24,7 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
-            from
+            from modelscope import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
@@ -94,9 +94,11 @@ class OpenaiPlugin(ApiPluginBase):
             payload['top_p'] = param.top_p
         if param.top_k is not None:
             payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload
 
-    def parse_responses(self, responses, request: Any = None, **kwargs) ->
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
        """Parser responses and return number of request and response tokens.
        Only one response for non-stream, multiple responses for stream.
        """
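Note: because extra_args is merged into the payload last, it can both add vendor-specific fields and override the defaults already set. A small sketch of that merge order; the default fields and values here are illustrative, not the plugin's actual defaults.

```python
from typing import Optional


def build_payload(model: str, messages: list, extra_args: Optional[dict] = None) -> dict:
    payload = {'model': model, 'messages': messages, 'temperature': 0.0}
    if extra_args is not None:
        payload.update(extra_args)  # applied last, so it can add or override keys
    return payload


# Overriding temperature and adding a custom flag via extra_args:
print(build_payload('my-model', [{'role': 'user', 'content': 'hi'}],
                    extra_args={'temperature': 0.7, 'enable_thinking': False}))
```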
evalscope/perf/plugin/datasets/custom.py
CHANGED
@@ -18,4 +18,7 @@ class CustomDatasetPlugin(DatasetPluginBase):
             prompt = item.strip()
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt
evalscope/perf/plugin/datasets/line_by_line.py
CHANGED
@@ -19,4 +19,7 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
             prompt = item.strip()
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt
evalscope/perf/plugin/datasets/longalpaca.py
CHANGED
@@ -24,4 +24,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
             prompt = item['instruction'].strip()
             if len(prompt) > self.query_parameters.min_prompt_length and len(
                     prompt) < self.query_parameters.max_prompt_length:
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt
evalscope/perf/plugin/datasets/openqa.py
CHANGED
@@ -29,4 +29,7 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
             prompt = item['question'].strip()
             if (len(prompt) > self.query_parameters.min_prompt_length
                     and len(prompt) < self.query_parameters.max_prompt_length):
-
+                if self.query_parameters.apply_chat_template:
+                    yield [{'role': 'user', 'content': prompt}]
+                else:
+                    yield prompt
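Note: the same apply_chat_template toggle is added to the custom, line_by_line, longalpaca, and openqa plugins above. A standalone sketch of the shared yield pattern; the length bounds and sample prompt are illustrative.

```python
from typing import Dict, Iterator, List, Union


def build_messages(prompts: List[str], apply_chat_template: bool,
                   min_len: int = 0, max_len: int = 1024) -> Iterator[Union[List[Dict], str]]:
    for prompt in prompts:
        prompt = prompt.strip()
        if min_len < len(prompt) < max_len:
            if apply_chat_template:
                yield [{'role': 'user', 'content': prompt}]   # chat-style endpoints
            else:
                yield prompt                                  # plain completion endpoints


print(list(build_messages(['What is 2+2?'], apply_chat_template=True)))
```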
evalscope/perf/plugin/datasets/random_dataset.py
CHANGED
@@ -23,8 +23,12 @@ class RandomDatasetPlugin(DatasetPluginBase):
         self.number = self.query_parameters.number or 1
 
     def build_messages(self) -> Iterator[List[Dict]]:
-
-
+        if self.query_parameters.apply_chat_template:
+            min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+            max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+        else:
+            min_prompt_length = self.query_parameters.min_prompt_length
+            max_prompt_length = self.query_parameters.max_prompt_length + 1
 
         assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
         assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
@@ -34,10 +38,13 @@ class RandomDatasetPlugin(DatasetPluginBase):
         offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
 
         for i in range(self.number):
-            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
-            prompt = self.tokenizer.decode(
-
-
+            prompt_ids = ((offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size).tolist()
+            prompt = self.tokenizer.decode(self.prefix_ids + prompt_ids)
+
+            if self.query_parameters.apply_chat_template:
+                yield [{'role': 'user', 'content': prompt}]
+            else:
+                yield prompt
 
     def get_random_inputs(self, length: int) -> List[int]:
         if length <= 0:
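Note: a minimal sketch of the updated id construction without a real tokenizer; the vocabulary size, seed, prefix ids, and lengths are made up, and decoding is omitted.

```python
import numpy as np


def random_prompt_ids(number: int, input_lens: np.ndarray, vocab_size: int, prefix_ids: list) -> list:
    """Return one list of token ids per request, reproducible for a fixed seed."""
    np.random.seed(0)
    offsets = np.random.randint(0, vocab_size, size=number)
    prompts = []
    for i in range(number):
        # Wrap around the vocabulary so every id stays valid.
        ids = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
        prompts.append(prefix_ids + ids)   # .tolist() so plain list concatenation works
    return prompts


print(random_prompt_ids(2, np.array([5, 7]), vocab_size=32000, prefix_ids=[1]))
```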
evalscope/perf/plugin/datasets/speed_benchmark.py
CHANGED
@@ -3,6 +3,9 @@ from typing import Dict, Iterator, List, Tuple
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 
 @register_dataset('speed_benchmark')
@@ -18,6 +21,14 @@ class SpeedBenchmarkDatasetPlugin(DatasetPluginBase):
     def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)
 
+        url = self.query_parameters.url
+        if url.endswith('v1/chat/completions'):
+            logger.warning(
+                'The API URL is not set correctly for `speed_benchmark`. Using `v1/completions` instead of `v1/chat/completions` by system.'  # noqa
+            )
+            url = url.replace('v1/chat/completions', 'v1/completions')
+            self.query_parameters.url = url
+
     def build_messages(self) -> Iterator[List[Dict]]:
         for input_len in self.INPUT_LENGTH:
             for _ in range(self.REPEAT):
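Note: a standalone sketch of the URL correction the plugin now performs at init; the print stands in for logger.warning.

```python
def fix_speed_benchmark_url(url: str) -> str:
    # speed_benchmark drives the raw completion endpoint, so a chat URL is rewritten.
    if url.endswith('v1/chat/completions'):
        print('Warning: rewriting v1/chat/completions to v1/completions for speed_benchmark')
        url = url.replace('v1/chat/completions', 'v1/completions')
    return url


print(fix_speed_benchmark_url('http://127.0.0.1:8000/v1/chat/completions'))
```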
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -11,7 +11,7 @@ logger = get_logger()
 @dataclass
 class BenchmarkData:
     request: Any = None
-    start_time: float =
+    start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)
     success: bool = False
@@ -73,7 +73,9 @@ class BenchmarkMetrics:
     avg_chunk_time: float = -1
     avg_prompt_tokens: float = -1
     avg_completion_tokens: float = -1
-
+    avg_input_token_per_seconds: float = -1
+    avg_output_token_per_seconds: float = -1
+    avg_total_token_per_seconds: float = -1
     avg_time_per_token: float = -1
     qps: float = -1
 
@@ -111,22 +113,26 @@ class BenchmarkMetrics:
             self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
-            self.
+            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+            self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+            self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+                                                + self.n_total_completion_tokens) / self.total_time
             self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
             return
 
-    def create_message(self, default_ndigits=
+    def create_message(self, default_ndigits=4):
         message = {
             'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
-            '
-            '
+            'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+            'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+            'Request throughput (req/s)': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
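Note: the three new throughput fields are plain ratios of token totals to elapsed time. A quick worked sketch with made-up numbers:

```python
# Hypothetical run: 2_000 prompt tokens, 5_000 completion tokens,
# 50 s total wall-clock time, 8 s summed time-to-first-chunk across requests.
n_prompt, n_completion = 2_000, 5_000
total_time, total_first_chunk_latency = 50.0, 8.0

avg_input_token_per_seconds = n_prompt / total_first_chunk_latency        # 250.0 tok/s
avg_output_token_per_seconds = n_completion / total_time                  # 100.0 tok/s
avg_total_token_per_seconds = (n_prompt + n_completion) / total_time      # 140.0 tok/s

print(avg_input_token_per_seconds, avg_output_token_per_seconds, avg_total_token_per_seconds)
```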
evalscope/perf/utils/db_util.py
CHANGED
@@ -175,7 +175,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
 
     metrics = {
         'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-        '
+        'ITL (s)':
         inter_token_latencies_all,
         'Latency (s)': [row[LATENCY_INDEX] for row in rows],
         'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
@@ -194,12 +194,12 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results
 
 
-def summary_result(args: Arguments, metrics: BenchmarkMetrics,
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
     data = metrics.create_message()
-    data.update({'Expected number of requests':
+    data.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
evalscope/perf/utils/log_utils.py
ADDED
@@ -0,0 +1,41 @@
+import os
+
+from evalscope.perf.arguments import Arguments
+
+
+def init_wandb(args: Arguments) -> None:
+    """
+    Initialize WandB for logging.
+    """
+    # Initialize wandb if the api key is provided
+    import datetime
+    try:
+        import wandb
+    except ImportError:
+        raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+    os.environ['WANDB_SILENT'] = 'true'
+    os.environ['WANDB_DIR'] = args.outputs_dir
+
+    wandb.login(key=args.wandb_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+def init_swanlab(args: Arguments) -> None:
+    import datetime
+    try:
+        import swanlab
+    except ImportError:
+        raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+    os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+    if not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    swanlab.config.update({'framework': '📏evalscope'})
+    swanlab.init(
+        project='perf_benchmark',
+        name=name,
+        config=args.to_dict(),
+        mode='local' if args.swanlab_api_key == 'local' else None)
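Note: a possible usage sketch for the new helpers, mirroring the guarded call sites added in benchmark.py. The SimpleNamespace stands in for evalscope's Arguments and only carries the fields these helpers appear to read in this diff (an assumption, not the real constructor).

```python
from types import SimpleNamespace

from evalscope.perf.utils.log_utils import init_swanlab, init_wandb

# Stand-in for evalscope.perf.arguments.Arguments with placeholder values.
args = SimpleNamespace(
    outputs_dir='./outputs', name=None, model_id='qwen2.5',
    wandb_api_key=None, swanlab_api_key='local',
    to_dict=lambda: {'model_id': 'qwen2.5'})

# Each backend is optional and only initialized when its key is present.
if args.wandb_api_key:
    init_wandb(args)        # logs metrics to Weights & Biases
if args.swanlab_api_key:
    init_swanlab(args)      # a 'local' key runs SwanLab in local mode without login
```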
evalscope/report/app.py
CHANGED
@@ -44,7 +44,7 @@ def scan_for_report_folders(root_path):
             continue
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-            datasets.append(os.path.basename(dataset_item)
+            datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
         datasets = DATASET_TOKEN.join(datasets)
         reports.append(
             f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -253,17 +253,17 @@ def process_model_prediction(item: Any):
 
 
 def normalize_score(score):
-
-
-
-
-
-
-    else:
-        try:
-            return float(score)
-        except (ValueError, TypeError):
+    try:
+        if isinstance(score, bool):
+            return 1.0 if score else 0.0
+        elif isinstance(score, dict):
+            for key in score:
+                return float(score[key])
            return 0.0
+        else:
+            return float(score)
+    except (ValueError, TypeError):
+        return 0.0
 
 
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
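Note: a standalone version of the reworked normalize_score with a few example inputs, showing how the single try/except now covers every branch.

```python
def normalize_score(score):
    try:
        if isinstance(score, bool):
            return 1.0 if score else 0.0
        elif isinstance(score, dict):
            for key in score:          # first value wins
                return float(score[key])
            return 0.0                 # empty dict
        else:
            return float(score)
    except (ValueError, TypeError):
        return 0.0


print([normalize_score(x) for x in [True, {'acc': 0.8}, '0.5', 'n/a', None]])
# -> [1.0, 0.8, 0.5, 0.0, 0.0]
```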
evalscope/run.py
CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs
 
 
evalscope/summarizer.py
CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                 summary_res: dict = csv_to_list(summary_file_path)[0]
             elif summary_file_path.endswith('json'):
                 summary_res: dict = json_to_dict(summary_file_path)
-
+            base_name = os.path.basename(summary_file_path)
+            file_name = os.path.splitext(base_name)[0]
             final_res_list.append({file_name: summary_res})
 
         elif eval_backend == EvalBackend.THIRD_PARTY:
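Note: the summarizer now derives the report key from the summary file name via basename plus splitext. A tiny sketch with an illustrative path:

```python
import os

summary_file_path = '/outputs/20240101_000000/reports/model/mmlu.json'   # illustrative path
base_name = os.path.basename(summary_file_path)   # 'mmlu.json'
file_name = os.path.splitext(base_name)[0]        # 'mmlu' (extension stripped)
print(file_name)
```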