evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/report/generator.py ADDED
@@ -0,0 +1,80 @@
+ import pandas as pd
+ from pandas import DataFrame
+
+ from evalscope.constants import DataCollection
+ from evalscope.report.utils import *
+
+
+ class ReportGenerator:
+
+     @staticmethod
+     def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+         """
+         Generate report for specific dataset.
+         subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
+         category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
+         metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+         """  # noqa: E501
+
+         dataset_name = kwargs.get('dataset_name', None)
+         model_name = kwargs.get('model_name', None)
+         category_map = kwargs.get('category_map', {})
+
+         def flatten_subset() -> DataFrame:
+             """
+             Flatten subset score map to a DataFrame.
+
+             Example:
+                          name  score  num  categories      metric_name
+                 0       ARC-Easy    0.5    2   [default]  AverageAccuracy
+                 1  ARC-Challenge    0.5    2   [default]  AverageAccuracy
+             """
+             subsets = []
+             for subset_name, scores in subset_score_map.items():
+                 for score_item in scores:
+                     categories = category_map.get(subset_name, ['default'])
+                     if isinstance(categories, str):
+                         categories = [categories]
+                     subsets.append(
+                         dict(
+                             name=subset_name,
+                             score=score_item['score'],
+                             num=score_item['num'],
+                             metric_name=score_item['metric_name'],
+                             categories=tuple(categories)))
+             df = pd.DataFrame(subsets)
+             return df
+
+         df = flatten_subset()
+
+         metrics_list = []
+         for metric_name, group_metric in df.groupby('metric_name'):
+             categories = []
+             for category_name, group_category in group_metric.groupby('categories'):
+                 subsets = []
+                 for _, row in group_category.iterrows():
+                     subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+                 categories.append(Category(name=category_name, subsets=subsets))
+
+             metrics_list.append(Metric(name=metric_name, categories=categories))
+
+         report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+         return report
+
+     @staticmethod
+     def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+         categories = []
+         for category_name, group_category in df.groupby('categories'):
+             subsets = []
+             for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
+                 avg_score = group_subset['score'].mean()
+                 num = group_subset['score'].count()
+                 subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+
+             categories.append(Category(name=category_name, subsets=subsets))
+         return Report(
+             name=DataCollection.NAME,
+             metrics=[Metric(name='Average', categories=categories)],
+             dataset_name=all_dataset_name,
+             model_name=model_name)
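A minimal usage sketch of the new `ReportGenerator` (not part of the diff; the subset names, scores, and keyword values are illustrative):

```python
# Illustrative sketch only: scores, names, and kwargs are made up.
from evalscope.report.generator import ReportGenerator

subset_score_map = {
    'ARC-Easy': [{'metric_name': 'AverageAccuracy', 'score': 0.8, 'num': 5}],
    'ARC-Challenge': [{'metric_name': 'AverageAccuracy', 'score': 0.4, 'num': 5}],
}

report = ReportGenerator.gen_report(
    subset_score_map,
    report_name='arc-demo',
    dataset_name='ai2_arc',
    model_name='Qwen2.5-0.5B-Instruct',
    category_map={},  # every subset falls back to the 'default' category
)
print(report.to_dict())
```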
evalscope/report/utils.py ADDED
@@ -0,0 +1,133 @@
+ import json
+ import pandas as pd
+ from collections import defaultdict
+ from dataclasses import asdict, dataclass, field
+ from typing import Any, Dict, List
+
+ from evalscope.metrics import macro_mean, micro_mean
+ from evalscope.utils import normalize_score
+
+
+ @dataclass
+ class Subset:
+     name: str = 'default_subset'
+     score: float = 0.0
+     num: int = 0
+
+     def __post_init__(self):
+         self.score = normalize_score(self.score)
+
+
+ @dataclass
+ class Category:
+     name: tuple[str] = field(default_factory=tuple)
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     subsets: List[Subset] = field(default_factory=list)
+
+     def __post_init__(self):
+         if isinstance(self.name, str):
+             # ensure name is tuple format
+             self.name = (self.name, )
+         self.num = sum(subset.num for subset in self.subsets)
+         self.score = normalize_score(micro_mean(self.subsets))
+         self.macro_score = normalize_score(macro_mean(self.subsets))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+         return cls(name=data['name'], subsets=subsets)
+
+
+ @dataclass
+ class Metric:
+     name: str = 'default_metric'
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     categories: List[Category] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.num = sum(category.num for category in self.categories)
+         self.score = normalize_score(micro_mean(self.categories))
+         self.macro_score = normalize_score(macro_mean(self.categories))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         categories = [Category.from_dict(category) for category in data.get('categories', [])]
+         return cls(name=data['name'], categories=categories)
+
+
+ class ReportKey:
+     model_name = 'Model'
+     dataset_name = 'Dataset'
+     metric_name = 'Metric'
+     category_name = 'Category'
+     category_prefix = 'Cat.'
+     subset_name = 'Subset'
+     num = 'Num'
+     score = 'Score'
+
+
+ @dataclass
+ class Report:
+     name: str = 'default_report'
+     dataset_name: str = 'default_dataset'
+     model_name: str = 'default_model'
+     score: float = 0.0
+     metrics: List[Metric] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+     def to_dict(self) -> Dict[str, Any]:
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+         return cls(
+             name=data['name'],
+             score=data['score'],
+             metrics=metrics,
+             dataset_name=data['dataset_name'],
+             model_name=data['model_name'])
+
+     @classmethod
+     def from_json(cls, json_file: str):
+         with open(json_file, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+         table = defaultdict(list)
+         for metric in self.metrics:
+             for category in metric.categories:
+                 for subset in category.subsets:
+                     table[ReportKey.model_name].append(self.model_name)
+                     table[ReportKey.dataset_name].append(self.dataset_name)
+                     table[ReportKey.metric_name].append(metric.name)
+                     table[ReportKey.category_name].append(category.name)
+                     table[ReportKey.subset_name].append(subset.name)
+                     table[ReportKey.num].append(subset.num)
+                     table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+             # NOTE: only flatten metrics if needed, use the first metric by default
+             if not flatten_metrics:
+                 break
+         df = pd.DataFrame.from_dict(table, orient='columns')
+         if flatten_categories:
+             df = self._flatten_categories(df)
+         return df
+
+     def _flatten_categories(self, df: pd.DataFrame):
+         # expand categories to multiple rows
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories[ReportKey.category_name].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                 lambda x: x[level] if len(x) > level else None)
+
+         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+         return df_categories
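To make the shape of this data model concrete, here is a hand-rolled sketch (illustrative names and scores, not from the diff) that builds a `Report` bottom-up and flattens it with `to_dataframe()`:

```python
# Illustrative sketch only: subset names and scores are invented.
from evalscope.report.utils import Category, Metric, Report, ReportKey, Subset

subsets = [Subset(name='ARC-Easy', score=0.8, num=5), Subset(name='ARC-Challenge', score=0.4, num=5)]
category = Category(name='default', subsets=subsets)            # str name is normalized to a tuple
metric = Metric(name='AverageAccuracy', categories=[category])  # micro/macro means computed in __post_init__
report = Report(name='arc-demo', metrics=[metric], dataset_name='ai2_arc', model_name='demo-model')

df = report.to_dataframe()  # one row per subset; categories expanded into 'Cat.0', 'Cat.1', ... columns
print(df[[ReportKey.subset_name, ReportKey.num, ReportKey.score]])
```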
evalscope/run.py CHANGED
@@ -5,18 +5,17 @@ Run evaluation for LLMs.
  import os.path
  from argparse import Namespace
  from datetime import datetime
- from typing import List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

- from evalscope.arguments import parse_args
- from evalscope.benchmarks import Benchmark, BenchmarkMeta
  from evalscope.config import TaskConfig, parse_task_config
- from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend
- from evalscope.evaluator import Evaluator
- from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+ from evalscope.constants import DataCollection, EvalBackend
  from evalscope.utils import seed_everything
- from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+ from evalscope.utils.io_utils import OutputsStructure
  from evalscope.utils.logger import configure_logging, get_logger

+ if TYPE_CHECKING:
+     from evalscope.models import LocalModel
+
  logger = get_logger()


@@ -50,8 +49,8 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
      if task_cfg.use_cache:
          task_cfg.work_dir = task_cfg.use_cache
          logger.info(f'Set resume from {task_cfg.work_dir}')
-     elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-         task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+     task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

      outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -98,6 +97,8 @@ def get_backend_manager_class(eval_backend: EvalBackend):

  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
+     from evalscope.models import get_local_model
+
      # Initialize evaluator
      eval_results = {}
      base_model = get_local_model(task_cfg)
@@ -117,10 +118,13 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      return eval_results


- def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
+ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
      """Create an evaluator object for the specified dataset."""
+     from evalscope.benchmarks import Benchmark, BenchmarkMeta
+     from evalscope.evaluator import Evaluator
+     from evalscope.models import initialize_model_adapter

-     if dataset_name == 'data_collection':
+     if dataset_name == DataCollection.NAME:
          # EvaluatorCollection is a collection of evaluators
          from evalscope.collections import EvaluatorCollection
          return EvaluatorCollection(task_cfg, outputs)
@@ -143,6 +147,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt


  def main():
+     from evalscope.arguments import parse_args
      args = parse_args()
      run_task(args)

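For context, these deferred imports all sit behind the public `run_task` entry point referenced in the hunk headers above. A hedged sketch of driving it with a dict-style task config (the model name, dataset, and limit are illustrative and mirror the CLI flags shown in the README diff further down):

```python
# Hedged sketch: assumes a dict-style task config accepted by run_task; values are illustrative.
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
    'datasets': ['gsm8k'],
    'limit': 5,
}
run_task(task_cfg=task_cfg)
```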
evalscope/summarizer.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Union

  from evalscope.config import TaskConfig, parse_task_config
  from evalscope.constants import EvalBackend
- from evalscope.tools.combine_reports import gen_table
+ from evalscope.report import gen_table
  from evalscope.utils import csv_to_list, get_latest_folder_path
  from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger
evalscope/utils/chat_service.py CHANGED
@@ -5,7 +5,6 @@ from contextlib import contextmanager
  from functools import partial
  from pydantic import BaseModel, Field
  from threading import Thread
- from transformers import TextIteratorStreamer
  from typing import Any, List, Literal, Optional, Union


@@ -96,6 +95,7 @@ class ChatService:

      def __init__(self, model_path, attn_implementation):
          from modelscope import AutoModelForCausalLM, AutoTokenizer
+         from transformers import TextIteratorStreamer

          self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
          self.model = AutoModelForCausalLM.from_pretrained(
evalscope/utils/logger.py CHANGED
@@ -17,6 +17,7 @@ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
  # disable datasets logging
  logging.getLogger('datasets').setLevel(logging.WARNING)
  logging.getLogger('modelscope').setLevel(logging.WARNING)
+ logging.getLogger('httpx').setLevel(logging.WARNING)


  def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
evalscope/utils/model_utils.py CHANGED
@@ -1,5 +1,8 @@
  from enum import Enum
- from transformers import GenerationConfig
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from transformers import GenerationConfig


  class EvalBackend(Enum):
@@ -11,7 +14,7 @@ class EvalBackend(Enum):
      THIRD_PARTY = 'ThirdParty'


- def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
      # Use the default values of temperature/top_p/top_k in generation_config.
      if generation_config.temperature == 0:
          generation_config.do_sample = False
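A small caller-side sketch of the lazily-typed helper above (illustrative values; assumes `transformers` is installed in the evaluation environment):

```python
# Hedged sketch: constructs a real GenerationConfig only where it is needed,
# which is what the TYPE_CHECKING change above is meant to allow at import time.
from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning

gen_cfg = GenerationConfig(temperature=0, max_new_tokens=64)
fix_do_sample_warning(gen_cfg)  # temperature == 0 -> greedy decoding, do_sample forced to False
print(gen_cfg.do_sample)        # False
```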
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.9.0'
- __release_datetime__ = '2025-01-03 18:00:00'
+ __version__ = '0.10.0'
+ __release_datetime__ = '2025-01-20 20:00:00'
{evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.9.0
+ Version: 0.10.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -92,6 +92,11 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: plotly>=5.23.0; extra == "all"
+ Provides-Extra: app
+ Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -210,6 +215,8 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -374,15 +381,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name            | Dataset Name   | Metric Name     | Category Name   | Subset Name   | Num   | Score   |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k          | AverageAccuracy | default         | main          | 5     | 0.4     |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Easy      | 5     | 0.8     |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Challenge | 5     | 0.4     |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ ```
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
  ```
- +-----------------------+-------------------+-----------------+
- | Model                 | ai2_arc           | gsm8k           |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
+ ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
  ```

- ## ⚙️ Complex Evaluation
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation
+
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell