evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/arguments.py
CHANGED

@@ -33,11 +33,17 @@ class Arguments(BaseArgument):
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
     sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
 
+    # Tuning knobs
+    db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+    queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+    in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
+
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
     debug: bool = False  # Debug mode
-
-
+    visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+    wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+    swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
     name: Optional[str] = None  # Name for the run
 
     # Output settings
@@ -68,7 +74,7 @@ class Arguments(BaseArgument):
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = None  # Random seed for reproducibility
     stop: Optional[List[str]] = None  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
@@ -107,6 +113,14 @@ class Arguments(BaseArgument):
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
+        # Validate tuning knobs
+        if self.db_commit_interval <= 0:
+            self.db_commit_interval = 1
+        if self.queue_size_multiplier <= 0:
+            self.queue_size_multiplier = 1
+        if self.in_flight_task_multiplier <= 0:
+            self.in_flight_task_multiplier = 1
+
 
 class ParseKVAction(argparse.Action):
 
@@ -152,9 +166,15 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
 
+    # Tuning knobs
+    parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+    parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+    parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
+
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+    parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
     parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
     parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
@@ -190,7 +210,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=None)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
     parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
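
The three new knobs are plain dataclass fields, so they can be set from Python as well as from the new CLI flags. A minimal sketch, not taken from the package docs: the `model`, `url` and `api` fields are assumed from the existing perf arguments, and the endpoint is a placeholder.

    from evalscope.perf.arguments import Arguments

    args = Arguments(
        model='qwen2.5',                                  # assumed model name
        url='http://127.0.0.1:8801/v1/chat/completions',  # assumed local OpenAI-compatible endpoint
        api='openai',
        parallel=8,
        number=100,
        db_commit_interval=500,       # commit buffered rows to SQLite every 500 results
        queue_size_multiplier=4,      # result queue maxsize = parallel * 4
        in_flight_task_multiplier=2,  # at most parallel * 2 scheduled request tasks
    )
    # Per the validation added above, non-positive values are clamped back to 1 in __post_init__.
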
evalscope/perf/benchmark.py
CHANGED

@@ -3,8 +3,6 @@ import json
 import numpy as np
 import platform
 import sqlite3
-import time
-from http import HTTPStatus
 from tqdm import tqdm
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
@@ -82,86 +80,58 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
-
+    client: AioHttpClient,  # reuse shared client
 ):
     async with semaphore:
-
-
-
-        benchmark_data.start_time = time.perf_counter()
-        collected_messages = []
-        try:
-            async for is_error, state_code, response_data in client.post(request):
-                if is_error or state_code != HTTPStatus.OK:
-                    error_msg = str(response_data) if response_data else 'Unknown error'
-                    logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
-                    benchmark_data.success = False
-                    break
-                if response_data:
-                    collected_messages.append(response_data)
-                    benchmark_data.chunk_times.append(time.perf_counter())
-                benchmark_data.success = True
-                benchmark_data.update_gpu_usage()
-        except Exception as e:
-            if response_data:
-                collected_messages.append(response_data)
-            benchmark_data.success = False
-            logger.exception(e)
-            logger.error(f'Request query: {request} exception')
-        finally:
-            benchmark_data.completed_time = time.perf_counter()
-            benchmark_data.response_messages = collected_messages
-            await benchmark_data_queue.put(benchmark_data)
+        benchmark_data = await client.post(request)
+        benchmark_data.update_gpu_usage()
+        await benchmark_data_queue.put(benchmark_data)
 
 
 @exception_handler
 async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
-
     result_db_path = get_result_db_path(args)
 
-
-
-
-    while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-        try:
-            # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-            benchmark_data_queue.task_done()
-        except asyncio.TimeoutError:
-            # If timeout, continue to the next iteration
-            continue
-
-        # Update metrics based on the benchmark data
-        metrics.update_metrics(benchmark_data, api_plugin)
-
-        # Collect benchmark data for later database insertion
-        collected_benchmark_data.append(benchmark_data)
-
-        # Create a message with the updated metrics
-        message = metrics.create_message()
+    # Stream inserts to DB to avoid accumulating all results in memory
+    commit_every = args.db_commit_interval
+    processed_since_commit = 0
 
-        # Log the message to wandb\swanlab if the api key is provided
-        if args.wandb_api_key:
-            import wandb
-            wandb.log(message)
-        if args.swanlab_api_key:
-            import swanlab
-            swanlab.log(message)
-
-        # Log the message to the logger every n queries
-        if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-            msg = json.dumps(message, ensure_ascii=False, indent=2)
-            logger.info(msg)
-
-        pbar.update(1)  # Update the progress bar
-
-    # Now perform database operations after all benchmark data has been processed
     with sqlite3.connect(result_db_path) as con:
         cursor = con.cursor()
         create_result_table(cursor)
-
-
+
+        with tqdm(desc='Processing', total=args.number) as pbar:
+            while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+                try:
+                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+                except asyncio.TimeoutError:
+                    continue
+
+                # Update metrics and write to DB immediately
+                metrics.update_metrics(benchmark_data, api_plugin)
+                insert_benchmark_data(cursor, benchmark_data)
+                processed_since_commit += 1
+                if processed_since_commit >= commit_every:
+                    con.commit()
+                    processed_since_commit = 0
+
+                message = metrics.create_message()
+
+                if args.wandb_api_key:
+                    import wandb
+                    wandb.log(message)
+                if args.swanlab_api_key:
+                    import swanlab
+                    swanlab.log(message)
+
+                if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                    msg = json.dumps(message, ensure_ascii=False, indent=2)
+                    logger.info(msg)
+
+                benchmark_data_queue.task_done()
+                pbar.update(1)
+
         con.commit()
 
     return metrics, result_db_path
@@ -179,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
-    # Create API plugin instance for request/response processing
     api_plugin_class = ApiRegistry.get_class(args.api)
     api_plugin = api_plugin_class(args)
 
-
-    benchmark_data_queue = asyncio.Queue()
-    # reset event
+    benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
    data_process_completed_event.clear()
+
     # test connection
     await connect_test(args, api_plugin)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Create a single shared client session for all requests
+    client = AioHttpClient(args, api_plugin)
+    async with client:
+        # start statistic benchmark metric (consumer)
+        statistic_benchmark_metric_task = asyncio.create_task(
+            statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+        )
+
+        # start sending requests with bounded in-flight tasks
+        semaphore = asyncio.Semaphore(args.parallel)
+        in_flight: set[asyncio.Task] = set()
+        max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+        async for request in get_requests(args, api_plugin):
+            # Keep the number of scheduled tasks bounded to avoid OOM
+            if len(in_flight) >= max_in_flight:
+                done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                in_flight = pending
+
+            task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+            in_flight.add(task)
+
+        # Wait for remaining in-flight tasks
+        if in_flight:
+            await asyncio.gather(*in_flight, return_exceptions=True)
+
+        # Drain queue and finish
+        await benchmark_data_queue.join()
+        data_process_completed_event.set()
+
+        metrics, result_db_path = await statistic_benchmark_metric_task
+
     metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
     return metrics_result, percentile_result
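
The rewritten benchmark loop above replaces the old collect-everything approach with a bounded result queue plus a cap on scheduled request tasks. A self-contained sketch of the same scheduling pattern, independent of evalscope (names and sizes are illustrative):

    import asyncio


    async def worker(i: int, queue: asyncio.Queue) -> None:
        await asyncio.sleep(0.01)   # stands in for one HTTP request
        await queue.put(i)          # blocks when the bounded queue is full

    async def consumer(queue: asyncio.Queue, total: int) -> None:
        for _ in range(total):
            await queue.get()       # process one result (update metrics, insert into DB, ...)
            queue.task_done()

    async def main(total: int = 100, parallel: int = 8, in_flight_multiplier: int = 2) -> None:
        queue: asyncio.Queue = asyncio.Queue(maxsize=parallel * 5)
        consumer_task = asyncio.create_task(consumer(queue, total))

        in_flight: set = set()
        max_in_flight = parallel * in_flight_multiplier
        for i in range(total):
            if len(in_flight) >= max_in_flight:
                # Wait for at least one task to finish before scheduling more.
                _, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
            in_flight.add(asyncio.create_task(worker(i, queue)))

        if in_flight:
            await asyncio.gather(*in_flight)
        await queue.join()          # all results handed to the consumer
        await consumer_task

    asyncio.run(main())
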
evalscope/perf/http_client.py
CHANGED

@@ -3,6 +3,7 @@ import asyncio
 import time
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.api_plugin = api_plugin
+
+        # Configure connector similar to vLLM bench for better TTFT under load.
+        connector = aiohttp.TCPConnector(
+            limit=args.parallel or 0,  # 0 means no limit in aiohttp; use parallel as limit if set
+            limit_per_host=args.parallel or 0,
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            keepalive_timeout=60,
+            enable_cleanup_closed=True,
+            force_close=False,
+            ssl=('https://' in self.url),
+        )
+
         self.client = aiohttp.ClientSession(
+            connector=connector,
+            trust_env=True,
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else []
         )
@@ -43,23 +59,25 @@ class AioHttpClient:
         trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
         return trace_config
 
-    async def post(self, body):
-        """
-
-
+    async def post(self, body) -> BenchmarkData:
+        """
+        Send POST request and delegate response handling to API plugin.
+
+        Returns:
+            BenchmarkData: The benchmark data object containing request and response information.
         """
         try:
             # Delegate the request processing to the API plugin
-
-
+            output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+            return output
         except asyncio.TimeoutError as e:
             logger.error(
                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-
+            return BenchmarkData(success=False, error=str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-
+            return BenchmarkData(success=False, error=str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
 async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
-    is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
         messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
         request = api_plugin.build_request(messages)
 
-
-
+        output = await client.post(request)
+        return output
 
     while True:
         try:
-
-
-            )
-            if not is_error:
+            output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+            if output.success:
                 logger.info('Test connection successful.')
                 return True
-            logger.warning(f'Retrying...
+            logger.warning(f'Retrying... <{output.error}>')
         except Exception as e:
             logger.warning(f'Retrying... <{e}>')
 
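
The client now builds one TCPConnector-backed session that every request shares, instead of relying on a default connector per call. A generic aiohttp sketch of that design choice, with placeholder URL and timeouts (not the packaged class):

    import asyncio

    import aiohttp


    async def fetch(session: aiohttp.ClientSession, url: str) -> int:
        async with session.get(url) as resp:
            await resp.read()
            return resp.status

    async def main(parallel: int = 8) -> None:
        connector = aiohttp.TCPConnector(
            limit=parallel,         # pool at most `parallel` connections in total
            limit_per_host=parallel,
            ttl_dns_cache=300,      # cache DNS lookups for five minutes
            keepalive_timeout=60,   # reuse warm connections between requests
        )
        timeout = aiohttp.ClientTimeout(connect=10, sock_read=60)
        async with aiohttp.ClientSession(connector=connector, trust_env=True, timeout=timeout) as session:
            statuses = await asyncio.gather(*(fetch(session, 'https://example.com/') for _ in range(parallel)))
            print(statuses)

    asyncio.run(main())
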
evalscope/perf/main.py
CHANGED

@@ -4,7 +4,9 @@ import os
 import platform
 import threading
 import time
+import warnings
 from argparse import Namespace
+from logging import warn
 
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
     configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
     # Initialize wandb and swanlab
-
+    visualizer = args.visualizer
+    if visualizer is None:
+        if args.wandb_api_key is not None:
+            visualizer = 'wandb'
+            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+        elif args.swanlab_api_key is not None:
+            visualizer = 'swanlab'
+            warnings.warn(
+                '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+            )
+    args.visualizer = visualizer
+    if visualizer == 'wandb':
         init_wandb(args)
-
+    elif visualizer == 'swanlab':
         init_swanlab(args)
 
     # Initialize local server if needed
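
The new fallback can be read as a small pure function: an explicit `--visualizer` value wins, and the old API-key switches are mapped onto it with a deprecation warning. A standalone restatement of that branch (not the packaged function):

    import warnings
    from typing import Optional


    def select_visualizer(visualizer: Optional[str],
                          wandb_api_key: Optional[str],
                          swanlab_api_key: Optional[str]) -> Optional[str]:
        if visualizer is not None:
            return visualizer
        if wandb_api_key is not None:
            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
            return 'wandb'
        if swanlab_api_key is not None:
            warnings.warn('--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning)
            return 'swanlab'
        return None


    assert select_visualizer(None, 'key', None) == 'wandb'
    assert select_visualizer('swanlab', 'key', None) == 'swanlab'
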
evalscope/perf/plugin/api/base.py
CHANGED

@@ -3,6 +3,7 @@ from abc import abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
 class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_responses(self, responses: List, request:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
-            responses (List[
+            responses (List[Dict]): List of http response body, for stream output,
                 there are multiple responses, each is bytes, for general only one.
-            request (
+            request (str): The json string of request.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -52,8 +54,8 @@ class ApiPluginBase:
             headers: The request headers
             body: The request body
 
-
-
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
         raise NotImplementedError
 
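
With the base class now returning BenchmarkData from `process_request`, a custom plugin is a plain subclass. A minimal sketch under stated assumptions: the BenchmarkData constructor and field names (`request`, `start_time`, `completed_time`, `response_messages`) and the `build_request` signature are inferred from this diff rather than the package docs, and a real plugin would still be registered via `register_api` so that `--api <name>` can resolve it.

    import json
    import time
    from typing import Any, Dict, List, Tuple

    import aiohttp

    from evalscope.perf.plugin.api.base import ApiPluginBase
    from evalscope.perf.utils.benchmark_util import BenchmarkData


    class MinimalPlugin(ApiPluginBase):

        def build_request(self, messages, param=None) -> Dict:
            # Assumed shape: an OpenAI-style chat payload.
            return {'messages': messages}

        def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
            usage = responses[-1].get('usage', {}) if responses else {}
            return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)

        async def process_request(
            self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
        ) -> BenchmarkData:
            data = BenchmarkData(request=body)  # constructor argument assumed
            data.start_time = time.perf_counter()
            async with client_session.post(url, data=json.dumps(body), headers=headers) as resp:
                payload = await resp.json()
                data.success = resp.status < 400
                data.response_messages = [payload]
            data.completed_time = time.perf_counter()
            return data
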
evalscope/perf/plugin/api/custom_api.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
         return payload
 
-    def parse_responses(self, responses: List[
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
         """Parse API responses and return token counts.
 
         This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
         to calculate it using a tokenizer.
 
         Args:
-            responses (List[
-            request (
+            responses (List[Dict]): List of API response strings.
+            request (str, optional): The original request, which might be needed for token calculation.
             **kwargs: Additional arguments.
 
         Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
             logger.error(f'Error parsing responses: {e}')
             return 0, 0
 
-    async def process_request(
-
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
             headers (Dict): The request headers.
             body (Dict): The request body.
 
-
-
-            - is_error: Whether the response indicates an error
-            - status_code: HTTP status code
-            - response_data: Response content
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
-
-
-
-
-            # Convert body to JSON
-            data = json.dumps(body, ensure_ascii=False)
-
-            # Send the request
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
-                # Get the status code
-                status_code = response.status
-
-                # Check if it's a streaming response
-                if 'text/event-stream' in response.content_type:
-                    # Handle streaming response
-                    async for line in response.content:
-                        line_str = line.decode('utf-8').strip()
-                        if not line_str:
-                            continue
-
-                        # Check for data prefix in server-sent events
-                        if line_str.startswith('data: '):
-                            data = line_str[6:]  # Remove 'data: ' prefix
-
-                            # Check if it's the end of the stream
-                            if data == '[DONE]':
-                                break
-
-                            try:
-                                # Parse the JSON data
-                                parsed_data = json.loads(data)
-                                yield (False, status_code, json.dumps(parsed_data))
-                            except json.JSONDecodeError:
-                                yield (True, status_code, f'Failed to parse JSON: {data}')
-                else:
-                    # Handle regular response
-                    if 'application/json' in response.content_type:
-                        # JSON response
-                        content = await response.json()
-                        yield (status_code >= 400, status_code, json.dumps(content))
-                    else:
-                        # Text response
-                        content = await response.text()
-                        yield (status_code >= 400, status_code, content)
-
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, 500, str(e))
+        raise NotImplementedError(
+            'The `process_request` method must be implemented in a subclass. '
+            'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.'  # noqa: E501
+        )
 
 
 if __name__ == '__main__':