evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89)
  1. evalscope/arguments.py +3 -1
  2. evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
  6. evalscope/benchmarks/benchmark.py +12 -10
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
  10. evalscope/benchmarks/data_adapter.py +82 -19
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
  23. evalscope/benchmarks/musr/__init__.py +0 -0
  24. evalscope/benchmarks/musr/musr_adapter.py +71 -0
  25. evalscope/benchmarks/process_bench/__init__.py +0 -0
  26. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  27. evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
  28. evalscope/benchmarks/race/race_adapter.py +12 -16
  29. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  30. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  31. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  32. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  33. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  34. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  35. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  36. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  37. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
  38. evalscope/benchmarks/utils.py +43 -0
  39. evalscope/cli/start_app.py +4 -1
  40. evalscope/cli/start_eval.py +4 -3
  41. evalscope/cli/start_perf.py +4 -2
  42. evalscope/collections/evaluator.py +16 -1
  43. evalscope/config.py +13 -3
  44. evalscope/constants.py +7 -0
  45. evalscope/evaluator/evaluator.py +3 -1
  46. evalscope/metrics/__init__.py +2 -1
  47. evalscope/metrics/metrics.py +23 -2
  48. evalscope/metrics/named_metrics.py +1 -0
  49. evalscope/models/__init__.py +2 -1
  50. evalscope/models/base_adapter.py +32 -6
  51. evalscope/models/chat_adapter.py +4 -1
  52. evalscope/models/choice_adapter.py +4 -0
  53. evalscope/models/custom_adapter.py +2 -0
  54. evalscope/models/local_model.py +3 -2
  55. evalscope/models/register.py +28 -0
  56. evalscope/models/server_adapter.py +107 -29
  57. evalscope/perf/__init__.py +0 -1
  58. evalscope/perf/arguments.py +18 -8
  59. evalscope/perf/http_client.py +8 -6
  60. evalscope/perf/plugin/api/openai_api.py +11 -1
  61. evalscope/perf/utils/analysis_result.py +1 -1
  62. evalscope/perf/utils/benchmark_util.py +6 -2
  63. evalscope/report/app.py +15 -8
  64. evalscope/report/combinator.py +2 -2
  65. evalscope/run.py +6 -5
  66. evalscope/third_party/thinkbench/__init__.py +3 -0
  67. evalscope/third_party/thinkbench/eval.py +429 -0
  68. evalscope/third_party/thinkbench/infer.py +130 -0
  69. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  70. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  71. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  73. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  74. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  75. evalscope/utils/chat_service.py +1 -0
  76. evalscope/utils/filters.py +59 -0
  77. evalscope/utils/logger.py +3 -3
  78. evalscope/utils/model_utils.py +17 -1
  79. evalscope/utils/utils.py +45 -45
  80. evalscope/version.py +2 -2
  81. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
  82. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
  83. tests/cli/test_collection.py +1 -1
  84. tests/cli/test_run.py +151 -32
  85. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  86. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  87. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  88. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  89. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/third_party/thinkbench/tools/llm.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ from openai import OpenAI
+
+
+ def request_url(llm_config, content):
+     try:
+         client = OpenAI(
+             api_key=llm_config['api_key'],
+             base_url=llm_config['base_url'],
+         )
+         completion = client.chat.completions.create(
+             model=llm_config['model_name'],
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
+         return None
+
+ def request_qwen(content):
+     try:
+         client = OpenAI(
+             api_key=os.getenv('DASHSCOPE_API_KEY'),
+             base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+         )
+
+         completion = client.chat.completions.create(
+             model='qwen-max',
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
+
+
+ def request_local(content):
+     try:
+         client = OpenAI(
+             api_key='EMPTY',
+             base_url='http://0.0.0.0:8801/v1',
+         )
+         completion = client.chat.completions.create(
+             model='Qwen2.5-72B-Instruct',
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
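For orientation, a minimal sketch of how the new `request_url` helper might be called; the config values below are placeholders for any OpenAI-compatible endpoint, not values taken from the package:

```python
# Hypothetical usage of the request_url helper added above.
llm_config = {
    'api_key': 'EMPTY',                      # placeholder credential
    'base_url': 'http://127.0.0.1:8801/v1',  # any OpenAI-compatible server
    'model_name': 'Qwen2.5-72B-Instruct',
}

answer = request_url(llm_config, 'What is 2 + 2?')
if answer is None:
    print('Request failed; the exception was printed by request_url.')
else:
    print(answer)
```

Note that `request_qwen` and `request_local` return `None` implicitly on failure, since their `except` blocks only print the error.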
evalscope/third_party/thinkbench/tools/utils.py ADDED
@@ -0,0 +1,13 @@
+ import re
+
+
+ def extract_answer(solution_text: str):
+     boxed_pattern = r'\\boxed\{([^}]*)\}'
+     matches = re.findall(boxed_pattern, solution_text)
+     if matches:
+         last_boxed_content = matches[-1]
+         number_pattern = r'-?\d+'
+         number_matches = re.findall(number_pattern, last_boxed_content)
+         if number_matches:
+             return number_matches[-1].strip()
+     return None
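A quick sketch of what `extract_answer` returns on a typical solution string; the sample text is invented:

```python
# The helper takes the last \boxed{...} span, then the last integer inside it.
solution = r'The total is therefore \boxed{42} apples.'
print(extract_answer(solution))           # -> '42'
print(extract_answer('no boxed answer'))  # -> None
```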
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,37 +1,67 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
  from dataclasses import dataclass
- from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
- from swift.utils import seed_everything
-
- # TODO: Support custom model for swift infer
+ from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template

+ # Set the GPU environment variable
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'

  @dataclass
  class SwiftInferArgs:
      model_id_or_path: str
      model_type: str
+     infer_backend: str = 'vllm'  # options: 'pt', 'vllm', 'lmdeploy'
      max_new_tokens: int = 2048
-
+     temperature: float = 0.1
+     max_batch_size: int = 16

  class SwiftInfer:

      def __init__(self, args: SwiftInferArgs):
-         model_type = args.model_type
-         template_type = get_default_template_type(model_type)
-         model, tokenizer = get_model_tokenizer(
-             model_type, model_id_or_path=args.model_id_or_path, model_kwargs={'device_map': 'auto'})
-         model.generation_config.max_new_tokens = args.max_new_tokens
-         print(f'** Generation config: {model.generation_config}')
+         # Initialize the model for the chosen infer backend
+         if args.infer_backend == 'pt':
+             self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+         elif args.infer_backend == 'vllm':
+             from swift.llm import VllmEngine
+             self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+         elif args.infer_backend == 'lmdeploy':
+             from swift.llm import LmdeployEngine
+             self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+         else:
+             raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')

-         template = get_template(template_type, tokenizer)
-         seed_everything(42)
-
-         self.tokenizer = tokenizer
-         self.model = model
-         self.template = template
+         # Basic request configuration (optional)
+         self.request_config = RequestConfig(
+             max_tokens=args.max_new_tokens,
+             temperature=args.temperature,
+             stream=False  # can be switched to True for streaming inference
+         )

      def predict(self, system: str, query: str, history: list):
+         # In the Swift 3.0 standard interface, messages are passed as:
+         # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+         #            {"role": "user", "content": "user question"},
+         #            {"role": "assistant", "content": "assistant reply"}, ...]
+
+         messages = []
+         if system.strip():
+             messages.append({'role': 'system', 'content': system})
+
+         # Append the conversation history to the messages
+         for qa_pair in history:
+             # Assume each element of history looks like ("user input", "model response"); adjust to your data format.
+             user_answer, model_response = qa_pair
+             messages.append({'role': 'user', 'content': user_answer})
+             messages.append({'role': 'assistant', 'content': model_response})
+
+         # Add the current user question
+         messages.append({'role': 'user', 'content': query})
+
+         infer_request = InferRequest(messages=messages)
+
+         # Run inference
+         response = self.engine.infer([infer_request], self.request_config)

-         response, history = inference(self.model, self.template, query=query, system=system, history=history)
+         # Extract the text result returned by the model (assumes non-stream mode)
+         result_text = response[0].choices[0].message.content.strip()

-         return response
+         return result_text
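As a rough usage sketch (assuming ms-swift 3.x is installed; the model path, model_type, and history below are illustrative placeholders):

```python
# Hypothetical driver for the Swift 3.x interface shown above.
args = SwiftInferArgs(
    model_id_or_path='Qwen/Qwen2.5-7B-Instruct',  # placeholder model
    model_type='qwen2_5',                         # placeholder type string
    infer_backend='pt',                           # 'pt' avoids the vLLM/LMDeploy dependency
)
infer = SwiftInfer(args)

history = [('Hi, who are you?', 'I am a helpful assistant.')]
reply = infer.predict(
    system='You are a helpful assistant.',
    query='Summarize our conversation so far.',
    history=history,
)
print(reply)
```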
evalscope/utils/chat_service.py CHANGED
@@ -32,6 +32,7 @@ class ModelList(BaseModel):
  class ChatMessage(BaseModel):
      role: Literal['user', 'assistant', 'system']
      content: str
+     reasoning_content: Optional[str] = None


  class DeltaMessage(BaseModel):
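Because the new field defaults to `None`, existing payloads that omit it still validate; a small self-contained sketch (assuming pydantic, which `BaseModel` implies):

```python
from typing import Literal, Optional
from pydantic import BaseModel

class ChatMessage(BaseModel):
    role: Literal['user', 'assistant', 'system']
    content: str
    reasoning_content: Optional[str] = None  # new optional field

# Both messages validate; reasoning_content simply stays None when absent.
plain = ChatMessage(role='user', content='What is 7 * 6?')
with_cot = ChatMessage(role='assistant', content='42',
                       reasoning_content='7 * 6 = 42, so the answer is 42.')
print(plain.reasoning_content, with_cot.reasoning_content)
```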
evalscope/utils/filters.py ADDED
@@ -0,0 +1,59 @@
+ import re
+ from typing import Any, Callable, Dict
+
+
+ class Filter:
+     """
+     A base Filter class that implements the registry pattern
+     """
+     _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+     @classmethod
+     def register(cls, name: str) -> Callable:
+         """
+         Decorator to register a new filter function
+         """
+
+         def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+             cls._registry[name] = func
+             return func
+
+         return decorator
+
+     @classmethod
+     def get_filter(cls, name: str) -> Callable:
+         """
+         Get a registered filter by name
+         """
+         return cls._registry.get(name)
+
+     @classmethod
+     def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+         """
+         Apply a registered filter to a value
+         """
+         filter_func = cls.get_filter(name)
+         if filter_func is None:
+             raise ValueError(f'Filter {name} not found')
+         return filter_func(value, *args, **kwargs)
+
+
+ @Filter.register('remove_until')
+ def remove_until(value: str, marker: str) -> str:
+     """
+     Remove everything before the last occurrence of marker
+     """
+     if marker not in value:
+         return value
+     return value[value.rindex(marker) + len(marker):]
+
+
+ @Filter.register('extract')
+ def extract(value: str, pattern: str) -> str:
+     """
+     Extract content from string using regex pattern
+     """
+     match = re.search(pattern, value)
+     if match:
+         return match.group(0)
+     return ''
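A short sketch of how the registry is meant to be chained, using only the two filters registered above; the sample string is made up:

```python
raw = 'Reasoning... The final answer is: B. Done.'

# Drop everything up to the last marker, then pull out the option letter.
tail = Filter.apply('remove_until', raw, 'The final answer is:')
choice = Filter.apply('extract', tail, r'[A-D]')
print(choice)  # -> 'B'

# Unknown filter names fail loudly instead of returning the input unchanged.
try:
    Filter.apply('uppercase', raw)
except ValueError as err:
    print(err)  # Filter uppercase not found
```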
evalscope/utils/logger.py CHANGED
@@ -12,12 +12,12 @@ detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
  DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

- logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
+ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)

- # disable datasets logging
+ # set logging level
  logging.getLogger('datasets').setLevel(logging.WARNING)
- logging.getLogger('modelscope').setLevel(logging.WARNING)
  logging.getLogger('httpx').setLevel(logging.WARNING)
+ logging.getLogger('modelscope').setLevel(logging.ERROR)


  def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
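The practical effect of the new `force=True` is that evalscope's root handler wins even when an imported library has already called `basicConfig`; a minimal illustration:

```python
import logging

logging.basicConfig(level=logging.WARNING)           # e.g. done earlier by a dependency
logging.basicConfig(level=logging.INFO, force=True)  # force=True removes the old handlers first
logging.getLogger(__name__).info('now visible')      # emitted, because INFO took effect
```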
evalscope/utils/model_utils.py CHANGED
@@ -1,5 +1,6 @@
+ import os
  from enum import Enum
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional, Tuple, Union

  if TYPE_CHECKING:
      from transformers import GenerationConfig
@@ -22,3 +23,18 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
      generation_config.temperature = 1.
      generation_config.top_p = 1.
      generation_config.top_k = 50
+
+
+ def get_device() -> str:
+     from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+     if is_torch_npu_available():
+         device = 'npu'
+     elif is_torch_mps_available():
+         device = 'mps'
+     elif is_torch_cuda_available():
+         device = 'cuda'
+     else:
+         device = 'cpu'
+
+     return device
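A sketch of using the new helper (assuming transformers is installed; the probe order follows the code above):

```python
device = get_device()  # resolves to 'npu', 'mps', 'cuda' or 'cpu', checked in that order
print(f'Selected device: {device}')
```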
evalscope/utils/utils.py CHANGED
@@ -101,50 +101,50 @@ class ResponseParser:
          options_concat = '|'.join([str(i) for i in options])

          patterns = [
-             f'答案是?\s?([{options_concat}])',
-             f'答案是?\s?:([{options_concat}])',
-             f'答案是?\s?:([{options_concat}])',
-             f'答案应该?是\s?([{options_concat}])',
-             f'答案应该?选\s?([{options_concat}])',
-             f'答案为\s?([{options_concat}])',
-             f'答案选\s?([{options_concat}])',
-             f'选择?\s?([{options_concat}])',
-             f'故选?\s?([{options_concat}])'
-             f'只有选?项?\s?([{options_concat}])\s?是?对',
-             f'只有选?项?\s?([{options_concat}])\s?是?错',
-             f'只有选?项?\s?([{options_concat}])\s?不?正确',
-             f'只有选?项?\s?([{options_concat}])\s?错误',
-             f'说法不?对选?项?的?是\s?([{options_concat}])',
-             f'说法不?正确选?项?的?是\s?([{options_concat}])',
-             f'说法错误选?项?的?是\s?([{options_concat}])',
-             f'([{options_concat}])\s?是正确的',
-             f'([{options_concat}])\s?是正确答案',
-             f'选项\s?([{options_concat}])\s?正确',
-             f'所以答\s?([{options_concat}])',
-             f'所以\s?([{options_concat}][.。$]?$)',
-             f'所有\s?([{options_concat}][.。$]?$)',
-             f'[\s,::,]([{options_concat}])[。,,\.]?$',
-             f'[\s,,::][故即]([{options_concat}])[。\.]?$',
-             f'[\s,,::]因此([{options_concat}])[。\.]?$',
-             f'[是为。]\s?([{options_concat}])[。\.]?$',
-             f'因此\s?([{options_concat}])[。\.]?$',
-             f'显然\s?([{options_concat}])[。\.]?$',
-             f'答案是\s?(\S+)(?:。|$)',
-             f'答案应该是\s?(\S+)(?:。|$)',
-             f'答案为\s?(\S+)(?:。|$)',
-             f'答案是(.*?)[{options_concat}]',
-             f'答案为(.*?)[{options_concat}]',
-             f'固选(.*?)[{options_concat}]',
-             f'答案应该是(.*?)[{options_concat}]',
-             f'[Tt]he answer is \(?[{options_concat}]\)?',
-             f'[Tt]he correct answer is [{options_concat}]',
-             f'[Tt]he correct answer is:\n[{options_concat}]',
-             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-             f'^选项\s?([{options_concat}])',
-             f'^([{options_concat}])\s?选?项',
-             f'(\s|^)[{options_concat}][\s。,,::\.$]',
-             f'(\s|^)[{options_concat}](\s|$)',
-             f'[{options_concat}]',
+             rf'答案是?\s?([{options_concat}])',
+             rf'答案是?\s?:([{options_concat}])',
+             rf'答案是?\s?:([{options_concat}])',
+             rf'答案应该?是\s?([{options_concat}])',
+             rf'答案应该?选\s?([{options_concat}])',
+             rf'答案为\s?([{options_concat}])',
+             rf'答案选\s?([{options_concat}])',
+             rf'选择?\s?([{options_concat}])',
+             rf'故选?\s?([{options_concat}])'
+             rf'只有选?项?\s?([{options_concat}])\s?是?对',
+             rf'只有选?项?\s?([{options_concat}])\s?是?错',
+             rf'只有选?项?\s?([{options_concat}])\s?不?正确',
+             rf'只有选?项?\s?([{options_concat}])\s?错误',
+             rf'说法不?对选?项?的?是\s?([{options_concat}])',
+             rf'说法不?正确选?项?的?是\s?([{options_concat}])',
+             rf'说法错误选?项?的?是\s?([{options_concat}])',
+             rf'([{options_concat}])\s?是正确的',
+             rf'([{options_concat}])\s?是正确答案',
+             rf'选项\s?([{options_concat}])\s?正确',
+             rf'所以答\s?([{options_concat}])',
+             rf'所以\s?([{options_concat}][.。$]?$)',
+             rf'所有\s?([{options_concat}][.。$]?$)',
+             rf'[\s,::,]([{options_concat}])[。,,\.]?$',
+             rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
+             rf'[\s,,::]因此([{options_concat}])[。\.]?$',
+             rf'[是为。]\s?([{options_concat}])[。\.]?$',
+             rf'因此\s?([{options_concat}])[。\.]?$',
+             rf'显然\s?([{options_concat}])[。\.]?$',
+             rf'答案是\s?(\S+)(?:。|$)',
+             rf'答案应该是\s?(\S+)(?:。|$)',
+             rf'答案为\s?(\S+)(?:。|$)',
+             rf'答案是(.*?)[{options_concat}]',
+             rf'答案为(.*?)[{options_concat}]',
+             rf'固选(.*?)[{options_concat}]',
+             rf'答案应该是(.*?)[{options_concat}]',
+             rf'[Tt]he answer is \(?[{options_concat}]\)?',
+             rf'[Tt]he correct answer is [{options_concat}]',
+             rf'[Tt]he correct answer is:\n[{options_concat}]',
+             rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
+             rf'^选项\s?([{options_concat}])',
+             rf'^([{options_concat}])\s?选?项',
+             rf'(\s|^)[{options_concat}][\s。,,::\.$]',
+             rf'(\s|^)[{options_concat}](\s|$)',
+             rf'[{options_concat}]',
          ]

          regexes = [re.compile(pattern) for pattern in patterns]
@@ -166,8 +166,8 @@ class ResponseParser:
              text: The text to parse.
          """
          patterns = [
-             r'[Aa]nswer:\s*(\w+)',
              r'answer is \(?(\w+)\)?',
+             r'[Aa]nswer:\s*(\w+)',
              r'[Tt]he correct answer is:\s*(\w+)',
              r'[Tt]he correct answer is:\n\s*(\w+)',
              r'[Tt]he correct answer is:\n\n-\s*(\w+)',
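The switch from `f'...'` to `rf'...'` in the hunk above keeps the regex backslashes literal while still interpolating `options_concat`; with a plain f-string, sequences like `\s` are invalid string escapes that newer Python versions flag with a warning. A minimal sketch of the intended behaviour:

```python
import re

options_concat = 'A|B|C|D'

# rf'...' is both raw (backslashes preserved for re) and formatted (options injected).
pattern = rf'答案是?\s?([{options_concat}])'
match = re.search(pattern, '经过分析,答案是 B')
print(match.group(1))  # -> 'B'
```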
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.11.0'
- __release_datetime__ = '2025-02-13 12:00:00'
+ __version__ = '0.12.1'
+ __release_datetime__ = '2025-03-10 21:00:00'
{evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.11.0
+ Version: 0.12.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -21,6 +21,7 @@ Requires-Dist: accelerate
  Requires-Dist: cachetools
  Requires-Dist: datasets<=3.2.0,>=3.0.0
  Requires-Dist: editdistance
+ Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
  Requires-Dist: langdetect
@@ -58,6 +59,7 @@ Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
+ Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: langdetect; extra == "all"
@@ -101,10 +103,10 @@ Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
- Requires-Dist: plotly>=5.23.0; extra == "all"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
- Requires-Dist: plotly>=5.23.0; extra == "app"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -223,7 +225,14 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
- - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+
+ - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
+ - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
+ - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+ - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+ - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -460,7 +469,7 @@ Then, you can use the following command to evaluate the model API service:
  ```shell
  evalscope eval \
   --model qwen2.5 \
-  --api-url http://127.0.0.1:8801/v1/chat/completions \
+  --api-url http://127.0.0.1:8801/v1 \
   --api-key EMPTY \
   --eval-type service \
   --datasets gsm8k \