evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (48)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  5. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  6. evalscope/benchmarks/data_adapter.py +4 -2
  7. evalscope/benchmarks/drop/__init__.py +0 -0
  8. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  9. evalscope/benchmarks/drop/utils.py +59 -0
  10. evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
  11. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  12. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  13. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  14. evalscope/benchmarks/tool_bench/utils.py +202 -0
  15. evalscope/benchmarks/utils.py +3 -2
  16. evalscope/benchmarks/winogrande/__init__.py +0 -0
  17. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  18. evalscope/collections/evaluator.py +76 -26
  19. evalscope/config.py +46 -15
  20. evalscope/evaluator/evaluator.py +48 -14
  21. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  22. evalscope/metrics/llm_judge.py +3 -3
  23. evalscope/metrics/rouge_metric.py +11 -13
  24. evalscope/models/adapters/chat_adapter.py +51 -34
  25. evalscope/models/adapters/server_adapter.py +15 -19
  26. evalscope/perf/arguments.py +14 -5
  27. evalscope/perf/benchmark.py +4 -9
  28. evalscope/perf/main.py +69 -17
  29. evalscope/perf/utils/benchmark_util.py +33 -15
  30. evalscope/perf/utils/db_util.py +32 -20
  31. evalscope/perf/utils/log_utils.py +1 -1
  32. evalscope/perf/utils/rich_display.py +186 -0
  33. evalscope/report/app.py +47 -34
  34. evalscope/report/utils.py +1 -1
  35. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  36. evalscope/utils/deprecation_utils.py +42 -0
  37. evalscope/version.py +2 -2
  38. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
  39. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
  40. tests/aigc/test_t2i.py +4 -4
  41. tests/cli/test_all.py +3 -0
  42. tests/cli/test_collection.py +2 -1
  43. tests/cli/test_run.py +37 -14
  44. tests/perf/test_perf.py +27 -2
  45. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  46. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/rich_display.py ADDED
@@ -0,0 +1,186 @@
+ # the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+ import numpy as np
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.style import Style
+ from rich.table import Table
+ from rich.text import Text
+
+ from evalscope.utils.logger import get_logger
+ from .benchmark_util import Metrics
+ from .db_util import PercentileMetrics
+
+ logger = get_logger()
+
+
+ def analyze_results(all_results):
+     """Analyze all test results and generate a summary report"""
+     summary = []
+     total_tokens = 0
+     total_time = 0
+
+     for result in all_results:
+         total_metrics = result[0]
+         percentile_metrics = result[1]
+         percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+         try:
+             concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+             rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+             avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+             p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+             avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+             avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+             p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+             success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                             / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+             avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+             p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+             # Ensure all values are valid numbers
+             if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                 logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                 continue
+
+             summary.append([
+                 concurrency,
+                 f'{rps:.2f}' if rps is not None else 'N/A',
+                 f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                 f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                 f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                 f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                 f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                 f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                 f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                 f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+             ])
+
+             total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                 Metrics.SUCCEED_REQUESTS, 0)
+             total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+         except Exception as e:
+             logger.warning(
+                 f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+             continue
+
+     if not summary:
+         logger.warning('Error: No valid test result data')
+         return [], 0, 0
+
+     return summary, total_tokens, total_time
+
+
+ def print_summary(all_results, model_name):
+     """Print test results summary"""
+     summary, total_tokens, total_time = analyze_results(all_results)
+
+     if not summary:
+         logger.warning('No available test result data to display')
+         return
+
+     console = Console(width=100)  # Set fixed width
+
+     # Create title panel
+     title = Text('Performance Test Summary Report', style='bold')
+     console.print(Panel(title, width=60))
+
+     # Print basic information
+     basic_info = Table(show_header=False, width=60)
+     basic_info.add_column('Name', style='cyan', width=25)
+     basic_info.add_column('Value', style='green', width=35)
+
+     basic_info.add_row('Model', model_name)
+     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+     basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+
+     console.print('\nBasic Information:')
+     console.print(basic_info)
+
+     # Create detailed performance metrics table
+     table = Table(
+         title='Detailed Performance Metrics',
+         show_header=True,
+         header_style='bold cyan',
+         border_style='blue',
+         width=100,  # Set total table width
+         pad_edge=False,  # Reduce edge padding
+         min_width=60,  # Minimum width
+     )
+
+     # Add columns (set fixed column widths)
+     table.add_column('Conc.', justify='right', style='cyan')
+     table.add_column('RPS', justify='right')
+     table.add_column('Avg Lat.(s)', justify='right')
+     table.add_column('P99 Lat.(s)', justify='right')
+     table.add_column('Gen. toks/s', justify='right')
+     table.add_column('Avg TTFT(s)', justify='right')
+     table.add_column('P99 TTFT(s)', justify='right')
+     table.add_column('Avg TPOT(s)', justify='right')
+     table.add_column('P99 TPOT(s)', justify='right')
+     table.add_column('Success Rate', justify='right', style='green')
+
+     # Add data rows
+     for row in summary:
+         try:
+             # Set row style based on success rate
+             success_rate = float(row[6].rstrip('%'))
+             row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+             table.add_row(
+                 str(row[0]),  # Concurrency
+                 f'{float(row[1]):.2f}',  # RPS
+                 f'{float(row[2]):.3f}',  # Average Latency
+                 f'{float(row[3]):.3f}',  # P99 Latency
+                 f'{float(row[4]):.2f}',  # Average TPS
+                 f'{float(row[5]):.3f}',  # First Token Latency
+                 f'{float(row[7]):.3f}',  # P99 TTFT
+                 f'{float(row[8]):.3f}',  # Average TPOT
+                 f'{float(row[9]):.3f}',  # P99 TPOT
+                 row[6],  # Success Rate
+                 style=row_style)
+         except ValueError as e:
+             console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+             continue
+
+     console.print('\n')
+     console.print(table)
+
+     # Calculate and display best performance configuration
+     try:
+         best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+         best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+         perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+         perf_info.add_column('Metric', style='cyan', width=20)
+         perf_info.add_column('Value', style='green', width=40)
+
+         perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+         perf_info.add_row('Lowest Latency',
+                           f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+         console.print('\n')
+         console.print(perf_info)
+
+         # Performance recommendations
+         recommendations = []
+         if best_rps_idx == len(summary) - 1:
+             recommendations.append(
+                 'The system seems not to have reached its performance bottleneck, try higher concurrency')
+         elif best_rps_idx == 0:
+             recommendations.append('Consider lowering concurrency, current load may be too high')
+         else:
+             recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+         success_rate = float(summary[-1][6][:-1])
+         if success_rate < 95:
+             recommendations.append(
+                 'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+         recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+         console.print(recommend_text)
+         for rec in recommendations:
+             console.print(f'• {rec}', style='yellow')
+
+     except Exception as e:
+         console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
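
The new `rich_display` module is only exercised by the perf CLI, so here is a hedged usage sketch (not taken from the package or its tests). It assumes each entry of `all_results` is an indexable `(total_metrics, percentile_metrics)` pair keyed by the `Metrics` and `PercentileMetrics` constants referenced above, and that the modules are importable at the paths shown in the diff; every numeric value is invented.

```python
from evalscope.perf.utils.benchmark_util import Metrics
from evalscope.perf.utils.db_util import PercentileMetrics
from evalscope.perf.utils.rich_display import print_summary

# Synthetic aggregate metrics for a single concurrency level (all values invented)
total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 12.5,             # requests/sec
    Metrics.AVERAGE_LATENCY: 0.84,                # seconds
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 950.0,       # tokens/sec
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.12,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.011,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 76,
    Metrics.TIME_TAKEN_FOR_TESTS: 8.0,
}

# Parallel percentile lists, indexed via PERCENTILES as the code above does
percentile_metrics = {
    PercentileMetrics.PERCENTILES: ['50%', '99%'],
    PercentileMetrics.LATENCY: [0.80, 1.45],
    PercentileMetrics.TTFT: [0.10, 0.30],
    PercentileMetrics.TPOT: [0.010, 0.020],
}

# One (total_metrics, percentile_metrics) pair per concurrency level tested
print_summary([(total_metrics, percentile_metrics)], model_name='demo-model')
```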
evalscope/report/app.py CHANGED
@@ -223,6 +223,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
      return fig
 
 
+ def convert_markdown_image(text):
+     if not os.path.isfile(text):
+         return text
+     # Convert the image path to a markdown image tag
+     if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+         text = os.path.abspath(text)
+         image_tag = f'![image](gradio_api/file={text})'
+         logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+         return image_tag
+     return text
+
+
+ def convert_html_tags(text):
+     # match begin label
+     text = re.sub(r'<(\w+)>', r'[\1]', text)
+     # match end label
+     text = re.sub(r'</(\w+)>', r'[/\1]', text)
+     return text
+
+
+ def process_string(string: str, max_length: int = 2048) -> str:
+     string = convert_html_tags(string)  # for display labels e.g. `<think>`
+     if max_length and len(string) > max_length:
+         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+     return string
+
+
  def dict_to_markdown(data) -> str:
      markdown_lines = []
 
@@ -230,55 +257,41 @@ def dict_to_markdown(data) -> str:
          bold_key = f'**{key}**'
 
          if isinstance(value, list):
-             value_str = '\n' + '\n'.join([f' - {item}' for item in value])
+             value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
          elif isinstance(value, dict):
              value_str = dict_to_markdown(value)
          else:
              value_str = str(value)
 
-         value_str = process_string(value_str)
-         markdown_line = f'{bold_key}: {value_str}'
+         value_str = process_string(value_str, max_length=None)  # Convert HTML tags but don't truncate
+         markdown_line = f'{bold_key}:\n{value_str}'
          markdown_lines.append(markdown_line)
 
      return '\n\n'.join(markdown_lines)
 
 
- def convert_html_tags(text):
-     # match begin label
-     text = re.sub(r'<(\w+)>', r'[\1]', text)
-     # match end label
-     text = re.sub(r'</(\w+)>', r'[/\1]', text)
-     return text
-
-
- def convert_markdown_image(text):
-     if not os.path.isfile(text):
-         return text
-     # Convert the image path to a markdown image tag
-     if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
-         text = os.path.abspath(text)
-         image_tag = f'![image](gradio_api/file={text})'
-         logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
-         return image_tag
-     return text
+ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+     """
+     Process model prediction output into a formatted string.
 
+     Args:
+         item: The item to process. Can be a string, list, or dictionary.
+         max_length: The maximum length of the output string.
 
- def process_string(string: str, max_length: int = 2048) -> str:
-     string = convert_html_tags(string)  # for display labels e.g. `<think>`
-     if len(string) > max_length:
-         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
-     return string
-
-
- def process_model_prediction(item: Any):
+     Returns:
+         A formatted string representation of the input.
+     """
      if isinstance(item, dict):
-         res = dict_to_markdown(item)
-         return process_string(res)
+         result = dict_to_markdown(item)
      elif isinstance(item, list):
-         res = '\n'.join([process_model_prediction(item) for item in item])
-         return process_string(res)
+         result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
      else:
-         return process_string(str(item))
+         result = str(item)
+
+     # Apply HTML tag conversion and truncation only at the final output
+     if max_length is not None:
+         return process_string(result, max_length)
+     return result
 
 
  def normalize_score(score):
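
For context on the reworked report helpers, here is a hedged illustration of `process_model_prediction`, based only on the code in this hunk. The sample prediction is invented, and the import assumes `evalscope.report.app` (which needs the `app` extra) is importable.

```python
from evalscope.report.app import process_model_prediction  # requires the `app` extra (gradio, plotly)

# Invented model output mixing a tagged string and a list
prediction = {
    'answer': '<think>reasoning</think> 42',
    'choices': ['A', 'B'],
}

# Dicts are rendered via dict_to_markdown, lists become bullet items, and
# HTML-like tags are rewritten (<think> -> [think]); truncation to max_length
# now happens only once, at the final step.
print(process_model_prediction(prediction, max_length=2048))
# Roughly:
# **answer**:
# [think]reasoning[/think] 42
#
# **choices**:
#
# - A
# - B
```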
evalscope/report/utils.py CHANGED
@@ -96,7 +96,7 @@ class Report:
 
      @classmethod
      def from_json(cls, json_file: str):
-         with open(json_file, 'r') as f:
+         with open(json_file, 'r', encoding='utf-8') as f:
              data = json.load(f)
              return cls.from_dict(data)
 
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -6,11 +6,12 @@ from typing import Union
  from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
  from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
  from evalscope.utils import get_logger
+ from evalscope.utils.deprecation_utils import deprecated
  from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
  logger = get_logger()
 
-
+ @deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
  def run_task(task_cfg: Union[str, dict]):
 
      if isinstance(task_cfg, str):
evalscope/utils/deprecation_utils.py ADDED
@@ -0,0 +1,42 @@
+ import functools
+ import inspect
+ from typing import Callable, Optional
+
+ from .logger import get_logger
+
+ logger = get_logger()
+
+
+ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+     """
+     Decorator to mark functions as deprecated.
+
+     :param since: String indicating the version since deprecation
+     :param remove_in: Optional string indicating the version when it will be removed
+     :param alternative: Optional string suggesting an alternative
+     :return: Decorated function
+     """
+
+     def decorator(func: Callable) -> Callable:
+
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             # Get the file name where the function is defined
+             file_name = inspect.getfile(func)
+
+             # Construct the warning message
+             warning_parts = [
+                 f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                 f'and will be removed in version {remove_in}' if remove_in else None,
+                 f'Use {alternative} instead' if alternative else None
+             ]
+             warning_message = '. '.join(filter(None, warning_parts))
+
+             # Log the warning
+             logger.warning(warning_message)
+
+             return func(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
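
The decorator is applied to the ToolBench entry point above. Here is a minimal usage sketch; the decorated function is hypothetical and not part of evalscope.

```python
from evalscope.utils.deprecation_utils import deprecated


@deprecated(since='0.16.0', remove_in='0.18.0', alternative='new_scoring_api')
def old_scoring_api(x):
    # Hypothetical function used only to illustrate the decorator
    return x * 2


old_scoring_api(3)
# Logs a warning roughly of the form:
# "old_scoring_api in /path/to/example.py has been deprecated since version 0.16.0.
#  and will be removed in version 0.18.0. Use new_scoring_api instead"
```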
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.15.0'
- __release_datetime__ = '2025-04-29 00:00:00'
+ __version__ = '0.16.0'
+ __release_datetime__ = '2025-05-19 18:00:00'
{evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.15.0
+ Version: 0.16.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
- Requires-Dist: omegaconf
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: pillow
  Requires-Dist: pyarrow
- Requires-Dist: pyyaml
+ Requires-Dist: pyyaml>=5.1
  Requires-Dist: requests
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
@@ -48,6 +47,7 @@ Requires-Dist: word2number
  Provides-Extra: aigc
  Requires-Dist: diffusers; extra == "aigc"
  Requires-Dist: iopath; extra == "aigc"
+ Requires-Dist: omegaconf; extra == "aigc"
  Requires-Dist: open-clip-torch; extra == "aigc"
  Requires-Dist: opencv-python; extra == "aigc"
  Provides-Extra: all
@@ -61,12 +61,11 @@ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
- Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: pillow; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pyyaml; extra == "all"
+ Requires-Dist: pyyaml>=5.1; extra == "all"
  Requires-Dist: requests; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -92,13 +91,15 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
  Requires-Dist: numpy; extra == "all"
+ Requires-Dist: rich; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
- Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: uvicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  Requires-Dist: diffusers; extra == "all"
  Requires-Dist: iopath; extra == "all"
+ Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: open-clip-torch; extra == "all"
  Requires-Dist: opencv-python; extra == "all"
  Provides-Extra: app
@@ -110,9 +111,10 @@ Provides-Extra: perf
  Requires-Dist: aiohttp; extra == "perf"
  Requires-Dist: fastapi; extra == "perf"
  Requires-Dist: numpy; extra == "perf"
+ Requires-Dist: rich; extra == "perf"
  Requires-Dist: sse-starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
- Requires-Dist: unicorn; extra == "perf"
+ Requires-Dist: uvicorn; extra == "perf"
  Provides-Extra: rag
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
@@ -177,9 +179,23 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 
  ## 📝 Introduction
 
- EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
+ EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
+
+ - 🧠 Large Language Models
+ - 🎨 Multimodal Models
+ - 🔍 Embedding Models
+ - 🏆 Reranker Models
+ - 🖼️ CLIP Models
+ - 🎭 AIGC Models (Image-to-Text/Video)
+ - ...and more!
+
+ EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+ - 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+ - 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+ - 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
 
- The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀
+ Below is the overall architecture diagram of EvalScope:
 
  <p align="center">
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,8 @@ Please scan the QR code below to join our community groups:
 
  ## 🎉 News
 
+ - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+ - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -479,26 +497,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
  ```shell
  evalscope eval \
- --model Qwen/Qwen2.5-0.5B-Instruct \
- --model-args revision=master,precision=torch.float16,device_map=auto \
- --generation-config do_sample=true,temperature=0.5 \
+ --model Qwen/Qwen3-0.6B \
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
  ```
 
- ### Parameter
- - `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
- - `revision`: Model version, default is `master`
- - `precision`: Model precision, default is `auto`
- - `device_map`: Model device allocation, default is `auto`
- - `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
- - `do_sample`: Whether to use sampling, default is `false`
- - `max_length`: Maximum length, default is 2048
- - `max_new_tokens`: Maximum length of generation, default is 512
- - `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
+ ### Parameter Description
+ - `--model-args`: Model loading parameters, passed as a JSON string:
+ - `revision`: Model version
+ - `precision`: Model precision
+ - `device_map`: Device allocation for the model
+ - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+ - `do_sample`: Whether to use sampling
+ - `temperature`: Generation temperature
+ - `max_new_tokens`: Maximum length of generated tokens
+ - `chat_template_kwargs`: Model inference template parameters
+ - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
  - `few_shot_num`: Number of few-shot examples
- - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
+ - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
@@ -517,6 +536,11 @@ A stress testing tool focused on large language models, which can be customized
 
  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+ **Output example**
+
+ ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
+
+
  **Supports wandb for recording results**
 
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
@@ -565,7 +589,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </a>
 
  ## 🔜 Roadmap
- - [ ] Support for better evaluation report visualization
+ - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
@@ -575,7 +599,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [x] Multi-modal evaluation
  - [ ] Benchmarks
  - [ ] GAIA
- - [ ] GPQA
+ - [x] GPQA
  - [x] MBPP
 