evalscope-0.16.3-py3-none-any.whl → evalscope-0.17.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py
CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -30,6 +31,7 @@ class Arguments:
     number: Union[int, List[int]] = 1000  # Number of requests to be made
     parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
+    sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
 
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
@@ -48,6 +50,11 @@ class Arguments:
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
    apply_chat_template: Optional[bool] = None  # Whether to apply chat template
+    # random vl settings
+    image_width: int = 224  # Width of the image for random VL dataset
+    image_height: int = 224  # Height of the image for random VL dataset
+    image_format: str = 'RGB'  # Image format for random VL dataset
+    image_num: int = 1  # Number of images for random VL dataset
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -69,15 +76,6 @@ class Arguments:
     top_k: Optional[int] = None  # Top-k sampling setting for the response
     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
-    @staticmethod
-    def from_args(args):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-        return Arguments(**args_dict)
-
     def __post_init__(self):
         # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -108,12 +106,6 @@ class Arguments:
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self.__dict__
-
 
 class ParseKVAction(argparse.Action):
 
@@ -156,6 +148,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
+    parser.add_argument(
+        '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
 
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
@@ -172,6 +166,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
     parser.add_argument(
         '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
+    # random vl settings
+    parser.add_argument('--image-width', type=int, default=224, help='Width of the image for random VL dataset')
+    parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
+    parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
+    parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
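A minimal sketch of the new perf arguments in use, assuming only the field names shown in the diff above; the model and url values are placeholders, and 'random_vl' is assumed (not confirmed here) to be the registry key for the new random_vl_dataset.py plugin:

from evalscope.perf.arguments import Arguments

perf_args = Arguments(
    model='qwen2.5-vl',                               # placeholder model name
    url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    dataset='random_vl',                              # assumed key for the random VL dataset plugin
    number=[100, 1000],                               # one entry per run
    parallel=[1, 10],                                 # must match the length of number
    sleep_interval=10,                                # new: pause between runs, in seconds
    image_width=512,                                  # new: random VL image settings
    image_height=512,
    image_num=2,
)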
evalscope/perf/benchmark.py
CHANGED
@@ -6,15 +6,18 @@ import sqlite3
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, Dict, List, Tuple
-
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.http_client import AioHttpClient, test_connection
-from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
-from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
-from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
-from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
+
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+from .http_client import AioHttpClient, test_connection
+from .plugin import ApiRegistry, DatasetRegistry
+from .utils.benchmark_util import BenchmarkData, BenchmarkMetrics
+from .utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, load_prompt, summary_result
+from .utils.handler import add_signal_handlers, exception_handler
+
+if TYPE_CHECKING:
+    from .plugin import ApiPluginBase, DatasetPluginBase
 
 logger = get_logger()
 
@@ -22,28 +25,22 @@ data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
-
-
-
-
-
-        with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
-            return file.read()
-        return prompt_path_or_text
-
-    async def generate_requests_from_prompt(messages):
-        request = query_generator.build_request(messages, args)
+async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGenerator[dict, None]:
+
+    async def generate_requests_from_prompt():
+        prompt = load_prompt(args.prompt)
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+        request = api_plugin.build_request(messages)
        for _ in range(args.number):
            yield request
 
     async def generate_requests_from_dataset():
-        message_generator_class = DatasetRegistry(args.dataset)
+        message_generator_class = DatasetRegistry.get_class(args.dataset)
         message_generator = message_generator_class(args)
 
         dataset_messages = []
         try:
-            for messages in message_generator:
+            for messages in message_generator.build_messages():
                 dataset_messages.append(messages)
         except StopIteration:
             pass
@@ -56,7 +53,7 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
 
         while count < args.number:
             messages = dataset_messages[dataset_index]
-            request =
+            request = api_plugin.build_request(messages)
             if request is not None:
                 yield request
                 count += 1
@@ -64,13 +61,11 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
             dataset_index = (dataset_index + 1) % len(dataset_messages)
 
     if args.prompt:
-
-        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
-        generator = generate_requests_from_prompt(messages)
+        generator = generate_requests_from_prompt()
     elif args.dataset:
         generator = generate_requests_from_dataset()
     else:
-        raise
+        raise ValueError('Either prompt or dataset is required!')
 
     async for request in generator:
         yield request
@@ -85,9 +80,10 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
+    api_plugin: 'ApiPluginBase',
 ):
     async with semaphore:
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
         async with client:
             benchmark_data = BenchmarkData(request=request)
             benchmark_data.start_time = time.perf_counter()
@@ -95,7 +91,8 @@ async def send_request(
             try:
                 async for is_error, state_code, response_data in client.post(request):
                     if is_error or state_code != HTTPStatus.OK:
-
+                        error_msg = str(response_data) if response_data else 'Unknown error'
+                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
                         benchmark_data.success = False
                         break
                     if response_data:
@@ -116,12 +113,9 @@ async def send_request(
 
 
 @exception_handler
-async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
 
-    api_plugin_class = ApiRegistry(args.api)
-    api_plugin = api_plugin_class(args.tokenizer_path)
-
     result_db_path = get_result_db_path(args)
 
     collected_benchmark_data = []
@@ -172,8 +166,8 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
 
 @exception_handler
-async def connect_test(args: Arguments) -> bool:
-    if (not args.no_test_connection) and (not await test_connection(args)):
+async def connect_test(args: Arguments, api_plugin) -> bool:
+    if (not args.no_test_connection) and (not await test_connection(args, api_plugin)):
         raise TimeoutError('Test connection failed')
 
 
@@ -183,19 +177,24 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
+    # Create API plugin instance for request/response processing
+    api_plugin_class = ApiRegistry.get_class(args.api)
+    api_plugin = api_plugin_class(args)
+
     # init queue
     benchmark_data_queue = asyncio.Queue()
     # reset event
     data_process_completed_event.clear()
     # test connection
-    await connect_test(args)
+    await connect_test(args, api_plugin)
     # start statistic benchmark metric
-    statistic_benchmark_metric_task = asyncio.create_task(
+    statistic_benchmark_metric_task = asyncio.create_task(
+        statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
     # start send request
     semaphore = asyncio.Semaphore(args.parallel)
     send_request_tasks: List[asyncio.Task] = []
-    async for request in get_requests(args):
-        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+    async for request in get_requests(args, api_plugin):
+        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
         send_request_tasks.append(task)
 
     await asyncio.gather(*send_request_tasks, return_exceptions=True)
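For context, a sketch of what a dataset plugin must provide under the reworked get_requests() above: the plugin class is looked up via DatasetRegistry.get_class(), constructed with the full Arguments object, and iterated through build_messages(). The class below is illustrative only; how it is registered with DatasetRegistry (see plugin/registry.py) is omitted, and the base-class constructor is assumed to accept the Arguments object, matching how benchmark.py instantiates the concrete plugins:

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin import DatasetPluginBase

class TinyLineDataset(DatasetPluginBase):
    """Yields one user message per hard-coded line; illustration only."""

    def __init__(self, query_parameters: Arguments):
        super().__init__(query_parameters)

    def build_messages(self):
        for line in ('hello', 'world'):
            yield [{'role': 'user', 'content': line}]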
evalscope/perf/http_client.py
CHANGED
@@ -1,13 +1,13 @@
 import aiohttp
 import asyncio
-import json
 import time
-from
-from typing import AsyncGenerator, Dict, List, Tuple
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+
+if TYPE_CHECKING:
+    from .plugin.api.base import ApiPluginBase
 
 logger = get_logger()
 
@@ -17,95 +17,48 @@ class AioHttpClient:
     def __init__(
         self,
         args: Arguments,
+        api_plugin: 'ApiPluginBase',
     ):
         self.url = args.url
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
+        self.api_plugin = api_plugin
         self.client = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
-    def _create_trace_config(self):
-        trace_config = aiohttp.TraceConfig()
-        trace_config.on_request_start.append(self.on_request_start)
-        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
-        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
-        return trace_config
-
     async def __aenter__(self):
         pass
 
     async def __aexit__(self, exc_type, exc, tb):
         await self.client.close()
 
-
-
-
-
-
-
-
-            if sse_msg.event == 'error':
-                is_error = True
-            if sse_msg.data:
-                if sse_msg.data.startswith('[DONE]'):
-                    break
-                yield is_error, response.status, sse_msg.data
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_event_stream = 'text/event-stream'
-        is_success = response_status == HTTPStatus.OK
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_event_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                if content.get('object') == 'error':
-                    yield (True, content.get('code'), content.get('message'))  # DashScope
-                else:
-                    yield (False, response_status, json.dumps(content, ensure_ascii=False))
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content)
-        else:
-            # Handle error response with 'application/json' content type
-            if content_type_json in response_content_type:
-                error = await response.json()
-                yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle error response with 'text/event-stream' content type
-            elif content_type_event_stream in response_content_type:
-                async for _, _, data in self._handle_stream(response):
-                    error = json.loads(data)
-                    yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle other error responses
-            else:
-                msg = await response.read()
-                yield (True, response_status, msg.decode('utf-8'))
+    def _create_trace_config(self):
+        """Create trace configuration for debugging."""
+        trace_config = aiohttp.TraceConfig()
+        trace_config.on_request_start.append(self.on_request_start)
+        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
+        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
+        return trace_config
 
     async def post(self, body):
-
+        """Send POST request and delegate response handling to API plugin.
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
        try:
-
-            async
-
-
-        except asyncio.TimeoutError:
+            # Delegate the request processing to the API plugin
+            async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
+                yield result
+        except asyncio.TimeoutError as e:
             logger.error(
-                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-            yield (True, None,
+            yield (True, None, str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-            yield (True, None, e)
+            yield (True, None, str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -136,25 +89,16 @@ class AioHttpClient:
         logger.debug(f'Request received: <{method=}, {url=}, {truncated_chunk=}>')
 
 
-async def test_connection(args: Arguments) -> bool:
+async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
     is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
        async with client:
-            if args.apply_chat_template
-
-
-                    'role': 'user',
-                    'content': 'hello'
-                }],
-                'model': args.model,
-                'max_tokens': 10,
-                'stream': args.stream
-            }
-            else:
-                request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
+            messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
+            request = api_plugin.build_request(messages)
+
             async for is_error, state_code, response_data in client.post(request):
                 return is_error, state_code, response_data
 
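A usage sketch of the reworked client, mirroring the updated test_connection() above: AioHttpClient no longer parses responses itself and instead forwards post() to the plugin's process_request(). Argument values are placeholders:

from evalscope.perf.arguments import Arguments
from evalscope.perf.http_client import AioHttpClient
from evalscope.perf.plugin import ApiRegistry

async def one_shot(args: Arguments):
    api_plugin = ApiRegistry.get_class(args.api)(args)  # e.g. the OpenAI-compatible plugin
    request = api_plugin.build_request([{'role': 'user', 'content': 'hello'}])
    client = AioHttpClient(args, api_plugin)
    async with client:
        async for is_error, status, data in client.post(request):
            print(is_error, status, data)

# run with asyncio.run(one_shot(perf_args)) on a populated Arguments instance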
evalscope/perf/main.py
CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
-from evalscope.utils.
+from evalscope.utils.model_utils import seed_everything
 from .arguments import Arguments, parse_args
 from .benchmark import benchmark
 from .utils.db_util import get_output_path
@@ -57,8 +57,8 @@ def run_multi_benchmark(args: Arguments, output_path: str = None):
         results.append(metrics_result)
         # Sleep between runs to avoid overwhelming the server
         if i < len(number_list) - 1:
-            logger.info('Sleeping for
-            time.sleep(
+            logger.info(f'Sleeping for {args.sleep_interval} seconds before the next run...')
+            time.sleep(args.sleep_interval)
     # Analyze results
     print_summary(results, args.model_id)
     return results
evalscope/perf/plugin/__init__.py
CHANGED
@@ -1,2 +1,3 @@
-from
-from
+from .api import *
+from .datasets import *
+from .registry import ApiRegistry, DatasetRegistry
evalscope/perf/plugin/api/__init__.py
CHANGED
@@ -1,3 +1,4 @@
-from
-from
-from
+from .base import ApiPluginBase
+from .custom_api import CustomPlugin
+from .dashscope_api import DashScopeApiPlugin
+from .openai_api import OpenaiPlugin
evalscope/perf/plugin/api/base.py
CHANGED
@@ -1,16 +1,18 @@
+import aiohttp
 from abc import abstractmethod
-from typing import Any, Dict, List, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 
 
 class ApiPluginBase:
 
-    def __init__(self,
-        self.
+    def __init__(self, param: Arguments) -> None:
+        self.param = param
+        self.model_path = param.tokenizer_path
 
     @abstractmethod
-    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
+    def build_request(self, messages: List[Dict], param: Arguments = None) -> Dict:
         """Build a api request body.
 
         Args:
@@ -39,6 +41,22 @@ class ApiPluginBase:
         """
         raise NotImplementedError
 
+    @abstractmethod
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        raise NotImplementedError
+
     @staticmethod
     def replace_values(input_json: Any, model: str, prompt: str):
         if isinstance(input_json, dict):
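Finally, a hedged sketch of a custom plugin written against the extended base class above: subclasses now implement both build_request() and the async process_request() hook that AioHttpClient delegates to. The JSON handling below is deliberately minimal (no streaming, no token accounting), and registration with ApiRegistry is omitted:

from typing import AsyncGenerator, Dict, List, Tuple

import aiohttp

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase

class EchoJsonPlugin(ApiPluginBase):
    """Posts the body as JSON and yields the raw response text; illustration only."""

    def __init__(self, param: Arguments) -> None:
        super().__init__(param)

    def build_request(self, messages: List[Dict], param: Arguments = None) -> Dict:
        param = param or self.param
        return {'model': param.model, 'messages': messages, 'stream': False}

    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
        headers = {'Content-Type': 'application/json', **headers}
        async with client_session.post(url, json=body, headers=headers) as response:
            text = await response.text()
            # (is_error, status_code, response_data), matching the tuple documented in the base class
            yield (response.status != 200, response.status, text)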