evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (50)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -3
  3. evalscope/benchmarks/benchmark.py +3 -2
  4. evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  5. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  6. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  7. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  8. evalscope/benchmarks/data_adapter.py +32 -4
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  10. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  11. evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  12. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  13. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
  23. evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. evalscope/collections/evaluator.py +4 -4
  25. evalscope/config.py +11 -3
  26. evalscope/constants.py +8 -0
  27. evalscope/evaluator/evaluator.py +56 -17
  28. evalscope/metrics/llm_judge.py +104 -0
  29. evalscope/models/custom_adapter.py +1 -1
  30. evalscope/perf/arguments.py +11 -40
  31. evalscope/perf/benchmark.py +39 -28
  32. evalscope/perf/http_client.py +9 -1
  33. evalscope/perf/main.py +2 -1
  34. evalscope/perf/plugin/datasets/__init__.py +1 -0
  35. evalscope/perf/plugin/datasets/openqa.py +6 -11
  36. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  37. evalscope/perf/utils/db_util.py +3 -0
  38. evalscope/run.py +15 -3
  39. evalscope/third_party/longbench_write/infer.py +1 -1
  40. evalscope/version.py +2 -2
  41. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
  42. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
  43. tests/cli/test_all.py +144 -0
  44. tests/cli/test_collection.py +27 -1
  45. tests/cli/test_run.py +103 -11
  46. tests/perf/test_perf.py +23 -0
  47. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
  48. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
  49. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
  50. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

-    with sqlite3.connect(result_db_path) as con:
-        cursor = con.cursor()
-        create_result_table(cursor)
-        with tqdm(desc='Processing') as pbar:
-            while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-                try:
-                    # Attempt to get benchmark data from the queue with a timeout
-                    benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                    benchmark_data_queue.task_done()
-                except asyncio.TimeoutError:
-                    # If timeout, continue to the next iteration
-                    continue
+    collected_benchmark_data = []

-                # Update metrics based on the benchmark data
-                metrics.update_metrics(benchmark_data, api_plugin)
+    with tqdm(desc='Processing') as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue

-                # Insert benchmark data into the database and commit the transaction
-                insert_benchmark_data(cursor, benchmark_data)
-                con.commit()
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)

-                # Create a message with the updated metrics
-                message = metrics.create_message()
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)

-                # Log the message to wandb if the api key is provided
-                if args.wandb_api_key:
-                    wandb.log(message)
+            # Create a message with the updated metrics
+            message = metrics.create_message()

-                # Log the message to the logger every n queries
-                if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                    msg = json.dumps(message, ensure_ascii=False, indent=2)
-                    logger.info(msg)
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)

-                pbar.update(1)  # Update the progress bar
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)
+
+            pbar.update(1)  # Update the progress bar
+
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()

     return metrics, result_db_path

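For readers skimming the hunk above: the consumer loop no longer touches SQLite at all; results are buffered in memory and written in a single pass once the queue is drained, so no connection or transaction is held while awaiting. A minimal standalone sketch of that collect-then-write pattern (illustrative names, not evalscope's API):

```python
import asyncio
import sqlite3


async def consume(queue: asyncio.Queue, done: asyncio.Event, db_path: str) -> None:
    collected = []  # buffer results in memory instead of writing inside the hot loop
    while not (done.is_set() and queue.empty()):
        try:
            item = await asyncio.wait_for(queue.get(), timeout=0.01)
            queue.task_done()
        except asyncio.TimeoutError:
            continue
        collected.append(item)

    # Single write pass after processing finishes.
    with sqlite3.connect(db_path) as con:
        cur = con.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS result (value TEXT)')
        cur.executemany('INSERT INTO result (value) VALUES (?)', [(v,) for v in collected])
        con.commit()
```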
@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')


@@ -209,9 +215,14 @@ async def benchmark(args: Arguments) -> None:
     loop = asyncio.get_running_loop()
    add_signal_handlers(loop)

+    # init queue
     request_queue = asyncio.Queue()
     benchmark_data_queue = asyncio.Queue()

+    # reset event
+    query_send_completed_event.clear()
+    data_process_completed_event.clear()
+
     async def create_send_request_tasks():
         tasks: List[asyncio.Task] = []
         for idx in range(args.parallel):
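Why the two `clear()` calls matter: assuming `query_send_completed_event` and `data_process_completed_event` are module-level `asyncio.Event` objects shared by the producer and consumer tasks (which the hunk implies), a second `benchmark()` run in the same process would otherwise start with both events already set and exit immediately. A hedged sketch of the reset pattern:

```python
import asyncio

# Module-level events shared across runs (illustrative of the pattern, not evalscope's exact code).
query_send_completed_event = asyncio.Event()
data_process_completed_event = asyncio.Event()


async def run_benchmark_once() -> None:
    # Reset leftover state so a repeated invocation starts from a clean slate.
    query_send_completed_event.clear()
    data_process_completed_event.clear()

    # ... create queues, spawn producer/consumer tasks, then mark completion:
    query_send_completed_event.set()
    data_process_completed_event.set()
```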
@@ -145,7 +145,15 @@ async def test_connection(args: Arguments) -> bool:
     client = AioHttpClient(args)
     async with client:
         if 'chat/completions' in args.url:
-            request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model, 'max_tokens': 10}
+            request = {
+                'messages': [{
+                    'role': 'user',
+                    'content': 'hello'
+                }],
+                'model': args.model,
+                'max_tokens': 10,
+                'stream': args.stream
+            }
         else:
             request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
         async for is_error, state_code, response_data in client.post(request):
evalscope/perf/main.py CHANGED
@@ -32,9 +32,10 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

-    loop = asyncio.get_event_loop()
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
+
     loop.run_until_complete(benchmark(args))


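A note on the loop change: `asyncio.get_event_loop()` returns the thread's current loop (deprecated behavior when no loop is running, and the loop may already be closed after a previous run), whereas `asyncio.new_event_loop()` gives each call to `run_perf_benchmark` its own loop. A small sketch of the pattern, with a placeholder coroutine:

```python
import asyncio


async def benchmark_stub() -> None:
    await asyncio.sleep(0)  # placeholder for the real benchmark coroutine


def run_once() -> None:
    loop = asyncio.new_event_loop()  # fresh loop per invocation
    try:
        loop.run_until_complete(benchmark_stub())
    finally:
        loop.close()  # release loop resources so the function can be called again


run_once()
run_once()  # safe to call repeatedly; each run owns its own loop
```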
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
@@ -1,5 +1,5 @@
 import json
-import subprocess
+import os
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-            subprocess.call([
-                'modelscope',
-                'download',
-                '--dataset',
-                'AI-ModelScope/HC3-Chinese',
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)

         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
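For reference, the replacement download call as it would look outside the plugin. The repository id, file name, and `allow_patterns` argument are taken directly from the hunk; `dataset_snapshot_download` returns the local snapshot directory, hence the `os.path.join`:

```python
import os

from modelscope import dataset_snapshot_download

file_name = 'open_qa.jsonl'
# Download only the matching file and get back the local snapshot directory.
local_dir = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
dataset_path = os.path.join(local_dir, file_name)
print(dataset_path)
```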
@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
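The prompt construction above follows the vLLM `benchmark_serving` trick referenced in the comment: draw a random length and a random offset, then walk token ids modulo the vocabulary size so the decoded prompt lands near the requested token count. A self-contained sketch of just that arithmetic, using a made-up vocabulary size instead of a real tokenizer:

```python
import numpy as np

vocab_size = 32000        # stand-in for tokenizer.vocab_size
number = 3                # how many prompts to generate
min_len, max_len = 8, 16  # requested token-length bounds (template overhead already subtracted)

rng = np.random.default_rng(seed=0)
input_lens = rng.integers(min_len, max_len + 1, size=number)
offsets = rng.integers(0, vocab_size, size=number)

for i in range(number):
    # Same formula as the plugin: a shifted arange wrapped back into the vocab range.
    prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
    print(f'prompt {i}: {len(prompt_ids)} token ids, first 5 = {prompt_ids[:5]}')
    # In the plugin, these ids are decoded with the real tokenizer to form the prompt text.
```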
@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
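A quick illustration of the new sanitization step: the character class matches the characters the hunk filters out (all invalid in Windows file names), each occurrence being replaced with an underscore.

```python
import re

raw = 'outputs/20250324_180000/qwen:7b?latest'
safe = re.sub(r'[<>:"|?*]', '_', raw)
print(safe)  # outputs/20250324_180000/qwen_7b_latest
```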
evalscope/run.py CHANGED
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg, outputs)
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-        return evaluate_model(task_cfg, outputs)
+        result = evaluate_model(task_cfg, outputs)
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results


@@ -132,7 +144,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
         data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)

     # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
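One hedged note on the cleanup block: `torch.cuda.empty_cache()` can only return cached blocks whose tensors are no longer referenced anywhere, so the `del` statements (plus a garbage-collection pass) are what actually make the memory reclaimable. A minimal standalone version of the same teardown, not evalscope's exact code:

```python
import gc

import torch

model = torch.nn.Linear(8, 8)
if torch.cuda.is_available():
    model = model.cuda()

# Drop the Python references first, collect cycles, then release cached CUDA blocks.
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
```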
@@ -8,7 +8,7 @@ import random
 import torch
 from typing import List

-from evalscope.models.api import OpenaiApi
+from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
 from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger

evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-__version__ = '0.12.1'
-__release_datetime__ = '2025-03-10 21:00:00'
+__version__ = '0.13.1'
+__release_datetime__ = '2025-03-24 18:00:00'
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.12.1
+Version: 0.13.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -175,16 +175,29 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Contents
-- [Introduction](#-introduction)
-- [News](#-news)
-- [Installation](#️-installation)
-- [Quick Start](#-quick-start)
+- [📋 Contents](#-contents)
+- [📝 Introduction](#-introduction)
+- [☎ User Groups](#-user-groups)
+- [🎉 News](#-news)
+- [🛠️ Installation](#️-installation)
+  - [Method 1: Install Using pip](#method-1-install-using-pip)
+  - [Method 2: Install from Source](#method-2-install-from-source)
+- [🚀 Quick Start](#-quick-start)
+  - [Method 1. Using Command Line](#method-1-using-command-line)
+  - [Method 2. Using Python Code](#method-2-using-python-code)
+  - [Basic Parameter](#basic-parameter)
+  - [Output Results](#output-results)
+- [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+- [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+- [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+  - [Parameter](#parameter)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
-- [Arena Mode](#-arena-mode)
-- [Contribution](#️-contribution)
-- [Roadmap](#-roadmap)
+- [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [🏟️ Arena Mode](#️-arena-mode)
+- [👷‍♂️ Contribution](#️-contribution)
+- [🔜 Roadmap](#-roadmap)
+- [Star History](#star-history)


 ## 📝 Introduction
@@ -226,6 +239,9 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
+- 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
@@ -262,23 +278,24 @@ Please scan the QR code below to join our community groups:
 We recommend using conda to manage your environment and installing dependencies with pip:

 1. Create a conda environment (optional)
-```shell
-# It is recommended to use Python 3.10
-conda create -n evalscope python=3.10
-# Activate the conda environment
-conda activate evalscope
-```
+   ```shell
+   # It is recommended to use Python 3.10
+   conda create -n evalscope python=3.10
+   # Activate the conda environment
+   conda activate evalscope
+   ```

 2. Install dependencies using pip
-```shell
-pip install evalscope # Install Native backend (default)
-# Additional options
-pip install evalscope[opencompass] # Install OpenCompass backend
-pip install evalscope[vlmeval] # Install VLMEvalKit backend
-pip install evalscope[rag] # Install RAGEval backend
-pip install evalscope[perf] # Install Perf dependencies
-pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
-```
+   ```shell
+   pip install evalscope # Install Native backend (default)
+   # Additional options
+   pip install 'evalscope[opencompass]' # Install OpenCompass backend
+   pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+   pip install 'evalscope[rag]' # Install RAGEval backend
+   pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+   pip install 'evalscope[app]' # Install dependencies for visualization
+   pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+   ```

 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -292,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies

 ### Method 2: Install from Source
 1. Download the source code
-```shell
-git clone https://github.com/modelscope/evalscope.git
-```
+   ```shell
+   git clone https://github.com/modelscope/evalscope.git
+   ```

 2. Install dependencies
-```shell
-cd evalscope/
-pip install -e . # Install Native backend
-# Additional options
-pip install -e '.[opencompass]' # Install OpenCompass backend
-pip install -e '.[vlmeval]' # Install VLMEvalKit backend
-pip install -e '.[rag]' # Install RAGEval backend
-pip install -e '.[perf]' # Install Perf dependencies
-pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
-```
+   ```shell
+   cd evalscope/
+   pip install -e . # Install Native backend
+   # Additional options
+   pip install -e '.[opencompass]' # Install OpenCompass backend
+   pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+   pip install -e '.[rag]' # Install RAGEval backend
+   pip install -e '.[perf]' # Install Perf dependencies
+   pip install -e '.[app]' # Install visualization dependencies
+   pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+   ```


 ## 🚀 Quick Start