evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -30,6 +31,7 @@ class Arguments:
     number: Union[int, List[int]] = 1000  # Number of requests to be made
     parallel: Union[int, List[int]] = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
+    sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
 
     # Logging and debugging
     log_every_n_query: int = 10  # Log every N queries
@@ -48,6 +50,11 @@ class Arguments:
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
     apply_chat_template: Optional[bool] = None  # Whether to apply chat template
+    # random vl settings
+    image_width: int = 224  # Width of the image for random VL dataset
+    image_height: int = 224  # Height of the image for random VL dataset
+    image_format: str = 'RGB'  # Image format for random VL dataset
+    image_num: int = 1  # Number of images for random VL dataset
 
     # Dataset settings
     dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -69,15 +76,6 @@ class Arguments:
     top_k: Optional[int] = None  # Top-k sampling setting for the response
     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
-    @staticmethod
-    def from_args(args):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-        return Arguments(**args_dict)
-
     def __post_init__(self):
         # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -108,12 +106,6 @@ class Arguments:
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self.__dict__
-
 
 class ParseKVAction(argparse.Action):
 
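The removed from_args, __str__, and to_dict helpers appear to move onto the new shared BaseArgument base class (added in evalscope/utils/argument_utils.py) that Arguments now inherits from. A plausible sketch of that base class, inferred only from the methods dropped above; the real implementation may differ:

# Hypothetical reconstruction of BaseArgument, inferred from this diff.
import json
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class BaseArgument:

    @classmethod
    def from_args(cls, args):
        # Convert an argparse Namespace to kwargs, dropping None values
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)  # compat with subcommand CLI parsers
        return cls(**args_dict)

    def to_dict(self) -> Dict[str, Any]:
        return self.__dict__

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)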
@@ -156,6 +148,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
+    parser.add_argument(
+        '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
 
     # Logging and debugging
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
@@ -172,6 +166,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
     parser.add_argument(
         '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
+    # random vl settings
+    parser.add_argument('--image-width', type=int, default=224, help='Width of the image for random VL dataset')
+    parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
+    parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
+    parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
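Together, these options make multi-run and vision-language load tests configurable in one place. A minimal sketch of a run exercising them; the run_perf_benchmark entry point and the 'random_vl' dataset name are assumptions not shown in this diff:

# Sketch only: drives the new sleep_interval and image_* options added above.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark  # assumed entry point

args = Arguments(
    model='qwen-vl-plus',
    url='http://127.0.0.1:8000/v1/chat/completions',
    api='openai',
    dataset='random_vl',   # hypothetical name for the new random VL dataset plugin
    image_width=512,       # new in 0.17.x
    image_height=512,
    image_num=2,
    number=[100, 200],     # two runs...
    parallel=[10, 20],     # ...at two concurrency levels
    sleep_interval=10,     # new: pause 10 s between runs instead of the fixed 5 s
)
run_perf_benchmark(args)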
evalscope/perf/benchmark.py CHANGED
@@ -6,15 +6,18 @@ import sqlite3
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, Dict, List, Tuple
-
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.http_client import AioHttpClient, test_connection
-from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
-from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
-from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
-from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
+
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+from .http_client import AioHttpClient, test_connection
+from .plugin import ApiRegistry, DatasetRegistry
+from .utils.benchmark_util import BenchmarkData, BenchmarkMetrics
+from .utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, load_prompt, summary_result
+from .utils.handler import add_signal_handlers, exception_handler
+
+if TYPE_CHECKING:
+    from .plugin import ApiPluginBase, DatasetPluginBase
 
 logger = get_logger()
 
@@ -22,28 +25,22 @@ data_process_completed_event = asyncio.Event()
 
 
 @exception_handler
-async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
-    query_generator_class = ApiRegistry(args.api)
-    query_generator = query_generator_class(args.tokenizer_path)
-
-    def load_prompt(prompt_path_or_text):
-        if prompt_path_or_text.startswith('@'):
-            with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
-                return file.read()
-        return prompt_path_or_text
-
-    async def generate_requests_from_prompt(messages):
-        request = query_generator.build_request(messages, args)
+async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGenerator[dict, None]:
+
+    async def generate_requests_from_prompt():
+        prompt = load_prompt(args.prompt)
+        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+        request = api_plugin.build_request(messages)
         for _ in range(args.number):
             yield request
 
     async def generate_requests_from_dataset():
-        message_generator_class = DatasetRegistry(args.dataset)
+        message_generator_class = DatasetRegistry.get_class(args.dataset)
         message_generator = message_generator_class(args)
 
         dataset_messages = []
         try:
-            for messages in message_generator:
+            for messages in message_generator.build_messages():
                 dataset_messages.append(messages)
         except StopIteration:
             pass
@@ -56,7 +53,7 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
 
         while count < args.number:
             messages = dataset_messages[dataset_index]
-            request = query_generator.build_request(messages, args)
+            request = api_plugin.build_request(messages)
             if request is not None:
                 yield request
                 count += 1
@@ -64,13 +61,11 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
             dataset_index = (dataset_index + 1) % len(dataset_messages)
 
     if args.prompt:
-        prompt = load_prompt(args.prompt)
-        messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
-        generator = generate_requests_from_prompt(messages)
+        generator = generate_requests_from_prompt()
     elif args.dataset:
         generator = generate_requests_from_dataset()
     else:
-        raise Exception('Either prompt or dataset is required!')
+        raise ValueError('Either prompt or dataset is required!')
 
     async for request in generator:
         yield request
@@ -85,9 +80,10 @@ async def send_request(
     request: dict,
     benchmark_data_queue: asyncio.Queue,
     args: Arguments,
+    api_plugin: 'ApiPluginBase',
 ):
     async with semaphore:
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
         async with client:
             benchmark_data = BenchmarkData(request=request)
             benchmark_data.start_time = time.perf_counter()
@@ -95,7 +91,8 @@ async def send_request(
             try:
                 async for is_error, state_code, response_data in client.post(request):
                     if is_error or state_code != HTTPStatus.OK:
-                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
+                        error_msg = str(response_data) if response_data else 'Unknown error'
+                        logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
                         benchmark_data.success = False
                         break
                     if response_data:
@@ -116,12 +113,9 @@ async def send_request(
 
 
 @exception_handler
-async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments):
+async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
     metrics = BenchmarkMetrics(concurrency=args.parallel)
 
-    api_plugin_class = ApiRegistry(args.api)
-    api_plugin = api_plugin_class(args.tokenizer_path)
-
     result_db_path = get_result_db_path(args)
 
     collected_benchmark_data = []
@@ -172,8 +166,8 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:
 
 
 @exception_handler
-async def connect_test(args: Arguments) -> bool:
-    if (not args.no_test_connection) and (not await test_connection(args)):
+async def connect_test(args: Arguments, api_plugin) -> bool:
+    if (not args.no_test_connection) and (not await test_connection(args, api_plugin)):
         raise TimeoutError('Test connection failed')
 
 
@@ -183,19 +177,24 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
+    # Create API plugin instance for request/response processing
+    api_plugin_class = ApiRegistry.get_class(args.api)
+    api_plugin = api_plugin_class(args)
+
     # init queue
     benchmark_data_queue = asyncio.Queue()
    # reset event
     data_process_completed_event.clear()
     # test connection
-    await connect_test(args)
+    await connect_test(args, api_plugin)
     # start statistic benchmark metric
-    statistic_benchmark_metric_task = asyncio.create_task(statistic_benchmark_metric(benchmark_data_queue, args))
+    statistic_benchmark_metric_task = asyncio.create_task(
+        statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
     # start send request
     semaphore = asyncio.Semaphore(args.parallel)
     send_request_tasks: List[asyncio.Task] = []
-    async for request in get_requests(args):
-        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args))
+    async for request in get_requests(args, api_plugin):
+        task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
         send_request_tasks.append(task)
 
     await asyncio.gather(*send_request_tasks, return_exceptions=True)
evalscope/perf/http_client.py CHANGED
@@ -1,13 +1,13 @@
 import aiohttp
 import asyncio
-import json
 import time
-from http import HTTPStatus
-from typing import AsyncGenerator, Dict, List, Tuple
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
-from evalscope.perf.arguments import Arguments
-from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
+from .arguments import Arguments
+
+if TYPE_CHECKING:
+    from .plugin.api.base import ApiPluginBase
 
 logger = get_logger()
 
@@ -17,95 +17,48 @@ class AioHttpClient:
 
     def __init__(
         self,
         args: Arguments,
+        api_plugin: 'ApiPluginBase',
     ):
         self.url = args.url
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
+        self.api_plugin = api_plugin
         self.client = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
-    def _create_trace_config(self):
-        trace_config = aiohttp.TraceConfig()
-        trace_config.on_request_start.append(self.on_request_start)
-        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
-        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
-        return trace_config
-
     async def __aenter__(self):
         pass
 
     async def __aexit__(self, exc_type, exc, tb):
         await self.client.close()
 
-    async def _handle_stream(self, response: aiohttp.ClientResponse):
-        is_error = False
-        async for line in response.content:
-            line = line.decode('utf8').rstrip('\n\r')
-            sse_msg = ServerSentEvent.decode(line)
-            if sse_msg:
-                logger.debug(f'Response recevied: {line}')
-                if sse_msg.event == 'error':
-                    is_error = True
-                if sse_msg.data:
-                    if sse_msg.data.startswith('[DONE]'):
-                        break
-                    yield is_error, response.status, sse_msg.data
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_event_stream = 'text/event-stream'
-        is_success = response_status == HTTPStatus.OK
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_event_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                if content.get('object') == 'error':
-                    yield (True, content.get('code'), content.get('message'))  # DashScope
-                else:
-                    yield (False, response_status, json.dumps(content, ensure_ascii=False))
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content)
-        else:
-            # Handle error response with 'application/json' content type
-            if content_type_json in response_content_type:
-                error = await response.json()
-                yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle error response with 'text/event-stream' content type
-            elif content_type_event_stream in response_content_type:
-                async for _, _, data in self._handle_stream(response):
-                    error = json.loads(data)
-                    yield (True, response_status, json.dumps(error, ensure_ascii=False))
-            # Handle other error responses
-            else:
-                msg = await response.read()
-                yield (True, response_status, msg.decode('utf-8'))
+    def _create_trace_config(self):
+        """Create trace configuration for debugging."""
+        trace_config = aiohttp.TraceConfig()
+        trace_config.on_request_start.append(self.on_request_start)
+        trace_config.on_request_chunk_sent.append(self.on_request_chunk_sent)
+        trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
+        return trace_config
 
     async def post(self, body):
-        headers = {'Content-Type': 'application/json', **self.headers}
+        """Send POST request and delegate response handling to API plugin.
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
         try:
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
-                async for rsp in self._handle_response(response):
-                    yield rsp
-        except asyncio.TimeoutError:
+            # Delegate the request processing to the API plugin
+            async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
+                yield result
+        except asyncio.TimeoutError as e:
             logger.error(
-                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-            yield (True, None, 'Timeout')
+            yield (True, None, str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-            yield (True, None, e)
+            yield (True, None, str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -136,25 +89,16 @@ class AioHttpClient:
         logger.debug(f'Request received: <{method=}, {url=}, {truncated_chunk=}>')
 
 
-async def test_connection(args: Arguments) -> bool:
+async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
     is_error = True
     start_time = time.perf_counter()
 
     async def attempt_connection():
-        client = AioHttpClient(args)
+        client = AioHttpClient(args, api_plugin)
         async with client:
-            if args.apply_chat_template:
-                request = {
-                    'messages': [{
-                        'role': 'user',
-                        'content': 'hello'
-                    }],
-                    'model': args.model,
-                    'max_tokens': 10,
-                    'stream': args.stream
-                }
-            else:
-                request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
+            messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
+            request = api_plugin.build_request(messages)
+
             async for is_error, state_code, response_data in client.post(request):
                 return is_error, state_code, response_data
 
evalscope/perf/main.py CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
-from evalscope.utils.utils import seed_everything
+from evalscope.utils.model_utils import seed_everything
 from .arguments import Arguments, parse_args
 from .benchmark import benchmark
 from .utils.db_util import get_output_path
@@ -57,8 +57,8 @@ def run_multi_benchmark(args: Arguments, output_path: str = None):
         results.append(metrics_result)
         # Sleep between runs to avoid overwhelming the server
         if i < len(number_list) - 1:
-            logger.info('Sleeping for 5 seconds before the next run...')
-            time.sleep(5)
+            logger.info(f'Sleeping for {args.sleep_interval} seconds before the next run...')
+            time.sleep(args.sleep_interval)
     # Analyze results
     print_summary(results, args.model_id)
     return results
evalscope/perf/plugin/__init__.py CHANGED
@@ -1,2 +1,3 @@
-from evalscope.perf.plugin.api import *
-from evalscope.perf.plugin.datasets import *
+from .api import *
+from .datasets import *
+from .registry import ApiRegistry, DatasetRegistry
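With the registries re-exported from the plugin package, lookups now go through get_class rather than calling the registry object directly, as benchmark.py does above. A small sketch of that pattern; the registry keys 'openai' and 'openqa' are assumed from the defaults in arguments.py:

# Minimal sketch of the new lookup pattern; keys are assumptions.
from evalscope.perf.plugin import ApiRegistry, DatasetRegistry

api_plugin_class = ApiRegistry.get_class('openai')          # e.g. OpenaiPlugin
dataset_plugin_class = DatasetRegistry.get_class('openqa')  # e.g. the openqa dataset plugin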
evalscope/perf/plugin/api/__init__.py CHANGED
@@ -1,3 +1,4 @@
-from evalscope.perf.plugin.api.custom_api import CustomPlugin
-from evalscope.perf.plugin.api.dashscope_api import DashScopeApiPlugin
-from evalscope.perf.plugin.api.openai_api import OpenaiPlugin
+from .base import ApiPluginBase
+from .custom_api import CustomPlugin
+from .dashscope_api import DashScopeApiPlugin
+from .openai_api import OpenaiPlugin
evalscope/perf/plugin/api/base.py CHANGED
@@ -1,16 +1,18 @@
+import aiohttp
 from abc import abstractmethod
-from typing import Any, Dict, List, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 
 
 class ApiPluginBase:
 
-    def __init__(self, model_path: str) -> None:
-        self.model_path = model_path
+    def __init__(self, param: Arguments) -> None:
+        self.param = param
+        self.model_path = param.tokenizer_path
 
     @abstractmethod
-    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
+    def build_request(self, messages: List[Dict], param: Arguments = None) -> Dict:
         """Build a api request body.
 
         Args:
@@ -39,6 +41,22 @@ class ApiPluginBase:
         """
         raise NotImplementedError
 
+    @abstractmethod
+    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
+                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+        """Process the HTTP request and handle the response.
+
+        Args:
+            client_session: The aiohttp client session
+            url: The request URL
+            headers: The request headers
+            body: The request body
+
+        Yields:
+            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        """
+        raise NotImplementedError
+
     @staticmethod
     def replace_values(input_json: Any, model: str, prompt: str):
         if isinstance(input_json, dict):
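The new process_request hook means a custom API plugin now owns both request construction and response handling, while AioHttpClient keeps timeouts and error logging. A minimal sketch of a conforming plugin; the class itself is hypothetical, and only the two abstract methods from this diff are implemented:

# Hypothetical plugin sketched against the ApiPluginBase contract shown above.
import aiohttp
from http import HTTPStatus
from typing import AsyncGenerator, Dict, Tuple

from evalscope.perf.arguments import Arguments
from evalscope.perf.plugin.api.base import ApiPluginBase


class SimpleJsonPlugin(ApiPluginBase):
    """Sends the request body as JSON and yields the raw response text."""

    def __init__(self, param: Arguments) -> None:
        super().__init__(param)  # keeps param and param.tokenizer_path as model_path

    def build_request(self, messages, param: Arguments = None) -> Dict:
        # messages comes from a dataset plugin: a list of chat messages or a plain string
        p = param or self.param
        return {'model': p.model, 'messages': messages, 'stream': p.stream}

    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
        # Non-streaming POST; yield a single (is_error, status_code, response_data) tuple
        request_headers = {'Content-Type': 'application/json', **headers}
        async with client_session.post(url, json=body, headers=request_headers) as response:
            text = await response.text()
            yield (response.status != HTTPStatus.OK, response.status, text)

AioHttpClient.post drives this coroutine, so parsing stays in the plugin while connection handling stays in the client.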