evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (79)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +7 -5
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -109
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/run.py +37 -66
  58. evalscope/run_arena.py +1 -1
  59. evalscope/utils/__init__.py +1 -1
  60. evalscope/utils/chat_service.py +4 -3
  61. evalscope/utils/io_utils.py +8 -0
  62. evalscope/utils/logger.py +4 -0
  63. evalscope/utils/model_utils.py +10 -0
  64. evalscope/utils/utils.py +3 -25
  65. evalscope/version.py +2 -2
  66. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
  67. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
  68. tests/cli/test_collection.py +53 -0
  69. tests/cli/test_run.py +43 -1
  70. tests/rag/test_mteb.py +3 -2
  71. evalscope/models/api/__init__.py +0 -3
  72. evalscope/models/dummy_chat_model.py +0 -49
  73. evalscope/models/model_adapter.py +0 -525
  74. evalscope/models/openai_model.py +0 -103
  75. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  76. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  77. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  78. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  79. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/run.py CHANGED
@@ -2,27 +2,23 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
 from typing import List, Optional, Union
 
 from evalscope.arguments import parse_args
+from evalscope.benchmarks import Benchmark, BenchmarkMeta
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
+from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend
 from evalscope.evaluator import Evaluator
-from evalscope.models.custom import CustomModel
-from evalscope.utils import import_module_util, seed_everything
+from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure, are_paths_same
 from evalscope.utils.logger import configure_logging, get_logger
 
 logger = get_logger()
 
-BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
-
 
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""
@@ -38,15 +34,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]
 
 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-    seed_everything(task_cfg.seed)
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
-
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
+        return run_non_native_backend(task_cfg, outputs)
     else:
         return evaluate_model(task_cfg, outputs)
 
@@ -68,7 +62,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     return outputs
 
 
-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
    """Run evaluation using a non-native backend."""
    eval_backend = task_cfg.eval_backend
    eval_config = task_cfg.eval_config
@@ -78,6 +72,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
 
     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()
 
     return dict()
@@ -102,75 +100,48 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-
+    base_model = get_local_model(task_cfg)
+    evaluators = []
     for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
+
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    for evaluator in evaluators:
         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict
 
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
     """Create an evaluator object for the specified dataset."""
-    imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-    model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-    in_prompt_template = dataset_config.get('prompt_template', '')
-    few_shot_num = dataset_config.get('few_shot_num', None)
-    few_shot_random = dataset_config.get('few_shot_random', True)
-
-    data_adapter = imported_modules['DataAdapterClass'](
-        few_shot_num=few_shot_num,
-        few_shot_random=few_shot_random,
-        prompt_template=in_prompt_template,
-        outputs=outputs,
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
 
-    logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+    if dataset_name == 'data_collection':
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
+
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
 
     return Evaluator(
-        dataset_name_or_path=dataset_name_or_path,
-        subset_list=in_subset_list,
+        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-        datasets_dir=task_cfg.dataset_dir,
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
    )
 
 
-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
-
-
 def main():
     args = parse_args()
     run_task(args)
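Taken together, the run.py changes replace the old `import_module_util` lookup with the `Benchmark` registry and load the local model once via `get_local_model` before building per-dataset evaluators. Below is a minimal, hedged sketch of driving the refactored entry point; the model id, dataset names, and `TaskConfig` keyword names are illustrative assumptions, not a verified signature.

```python
# Sketch only: run_task() accepts a TaskConfig (among other types) and returns
# a dict keyed by dataset name, as evaluate_model() above shows.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',  # hypothetical model id
    datasets=['arc', 'gsm8k'],         # names resolved via Benchmark.get(dataset_name)
    limit=5,                           # forwarded to Evaluator.eval(..., limit=...)
)

results = run_task(task_cfg)
for dataset_name, res_dict in results.items():
    print(dataset_name, res_dict)
```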
evalscope/run_arena.py CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models.model_adapter import ChatGenerationModelAdapter
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_obj_from_cfg
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.constants import *
+from evalscope.utils.model_utils import EvalBackend
 from evalscope.utils.utils import *
evalscope/utils/chat_service.py CHANGED
@@ -3,11 +3,10 @@ import time
 import torch
 from contextlib import contextmanager
 from functools import partial
-from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
 from threading import Thread
 from transformers import TextIteratorStreamer
-from typing import List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union
 
 
 class Usage(BaseModel):
@@ -66,7 +65,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 class ChatCompletionResponse(BaseModel):
     model: str
     object: Literal['chat.completion', 'chat.completion.chunk']
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[Usage]
 
@@ -96,6 +95,8 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
evalscope/utils/io_utils.py CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
 
     return real_path1 == real_path2
+
+
+def dict_to_json(d: dict, json_file: str):
+    """
+    Dump dict to json file.
+    """
+    with open(json_file, 'w') as f:
+        json.dump(d, f, indent=4, ensure_ascii=False)
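The new `dict_to_json` helper shown above writes a dict as indented JSON while preserving non-ASCII characters. A small usage sketch (the file name and payload are placeholders):

```python
# Usage sketch for the helper added in evalscope/utils/io_utils.py above.
from evalscope.utils.io_utils import dict_to_json

summary = {'dataset': 'gsm8k', 'accuracy': 0.8125, '备注': '保留中文字符'}
dict_to_json(summary, 'summary.json')  # written with indent=4, ensure_ascii=False
```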
evalscope/utils/logger.py CHANGED
@@ -14,6 +14,10 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else
 
 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 
+# disable datasets logging
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
evalscope/utils/model_utils.py CHANGED
@@ -1,6 +1,16 @@
+from enum import Enum
 from transformers import GenerationConfig
 
 
+class EvalBackend(Enum):
+    # NOTE: compatible with ms-swfit v2.x
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
+
+
 def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
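The `EvalBackend` enum now lives in `evalscope/utils/model_utils.py` and is re-exported through `evalscope.utils` and `evalscope.constants`, as the other hunks in this diff show. A minimal sketch of the dispatch it drives in `run_single_task`:

```python
# Sketch of the branching shown in run.py earlier in this diff: the native
# backend runs in-process, any other value is handed to a backend manager.
from evalscope.utils.model_utils import EvalBackend

backend = EvalBackend.OPEN_COMPASS
if backend != EvalBackend.NATIVE:
    print(f'Delegating to non-native backend: {backend.value}')  # -> OpenCompass
else:
    print('Running native evaluation')
```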
evalscope/utils/utils.py CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
             f'([{options_concat}])\s?是正确答案',
             f'选项\s?([{options_concat}])\s?正确',
             f'所以答\s?([{options_concat}])',
-            f'1.\s?([{options_concat}])[.。$]?$',
             f'所以\s?([{options_concat}][.。$]?$)',
             f'所有\s?([{options_concat}][.。$]?$)',
             f'[\s,::,]([{options_concat}])[。,,\.]?$',
@@ -137,16 +136,15 @@
             f'答案为(.*?)[{options_concat}]',
             f'固选(.*?)[{options_concat}]',
             f'答案应该是(.*?)[{options_concat}]',
-            f'[Tt]he answer is [{options_concat}]',
+            f'[Tt]he answer is \(?[{options_concat}]\)?',
             f'[Tt]he correct answer is [{options_concat}]',
             f'[Tt]he correct answer is:\n[{options_concat}]',
             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-            f'[{options_concat}]',
             f'^选项\s?([{options_concat}])',
             f'^([{options_concat}])\s?选?项',
             f'(\s|^)[{options_concat}][\s。,,::\.$]',
             f'(\s|^)[{options_concat}](\s|$)',
-            f'1.\s?(.*?)$',
+            f'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -169,6 +167,7 @@
         """
         patterns = [
             r'[Aa]nswer:\s*(\w+)',
+            r'answer is \(?(\w+)\)?',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',
@@ -199,27 +198,6 @@
 
 
 
-def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-    """
-    Import module utility function.
-
-    Args:
-        import_path_prefix: e.g. 'evalscope.benchmarks.'
-        module_name: The module name to import. e.g. 'mmlu'
-        members_to_import: The members to import.
-            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-    Returns:
-        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-    """
-    imported_modules = {}
-    module = importlib.import_module(import_path_prefix + module_name)
-    for member_name in members_to_import:
-        imported_modules[member_name] = getattr(module, member_name)
-
-    return imported_modules
-
-
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
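The pattern updates above make `ResponseParser` tolerant of answers wrapped in parentheses (e.g. "the answer is (B)") and move the bare `[{options_concat}]` fallback to the end of the list. A standalone illustration of the newly added pattern using plain `re`, not the parser itself:

```python
# Mirrors the added pattern r'answer is \(?(\w+)\)?' from the diff above.
import re

pattern = re.compile(r'answer is \(?(\w+)\)?')
for text in ('So the answer is (B).', 'The answer is C'):
    print(pattern.search(text).group(1))  # -> B, then C
```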
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.8.2'
-__release_datetime__ = '2024-12-26 20:00:00'
+__version__ = '0.9.0'
+__release_datetime__ = '2025-01-03 18:00:00'
{evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.2
+Version: 0.9.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -160,14 +160,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Contents
-- [Introduction](#introduction)
-- [News](#News)
-- [Installation](#installation)
-- [Quick Start](#quick-start)
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](#custom-dataset-evaluation)
-- [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
-- [Arena Mode](#arena-mode)
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)
 
 
 ## 📝 Introduction
@@ -208,11 +210,15 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +230,7 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
-
+</details>
 
 ## 🛠️ Installation
 ### Method 1: Install Using pip
@@ -414,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 
-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +445,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```
 
-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
+## 👷‍♂️ Contribution
 
+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!
 
+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+  <table>
+    <tr>
+      <th colspan="2">
+        <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+      </th>
+    </tr>
+  </table>
+</a>
 
-## TO-DO List
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation
@@ -462,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max
 
 
 ## Star History