evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +3 -1
- evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
- evalscope/benchmarks/benchmark.py +12 -10
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
- evalscope/benchmarks/data_adapter.py +82 -19
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
- evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
- evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +71 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
- evalscope/benchmarks/utils.py +43 -0
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +16 -1
- evalscope/config.py +13 -3
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +32 -6
- evalscope/models/chat_adapter.py +4 -1
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/local_model.py +3 -2
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +107 -29
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +18 -8
- evalscope/perf/http_client.py +8 -6
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +15 -8
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +429 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +151 -32
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/third_party/thinkbench/tools/llm.py
@@ -0,0 +1,48 @@
+import os
+from openai import OpenAI
+
+
+def request_url(llm_config, content):
+    try:
+        client = OpenAI(
+            api_key=llm_config['api_key'],
+            base_url=llm_config['base_url'],
+        )
+        completion = client.chat.completions.create(
+            model=llm_config['model_name'],
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return None
+
+def request_qwen(content):
+    try:
+        client = OpenAI(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        )
+
+        completion = client.chat.completions.create(
+            model='qwen-max',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+
+
+def request_local(content):
+    try:
+        client = OpenAI(
+            api_key='EMPTY',
+            base_url='http://0.0.0.0:8801/v1',
+        )
+        completion = client.chat.completions.create(
+            model='Qwen2.5-72B-Instruct',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
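The three helpers above are thin wrappers around an OpenAI-compatible chat endpoint. A minimal usage sketch follows; the key, endpoint, and prompt are placeholders, not values taken from the diff:

```python
# Hypothetical call to request_url; the config values below are placeholders.
llm_config = {
    'api_key': 'sk-...',                                              # your API key
    'base_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',  # any OpenAI-compatible endpoint
    'model_name': 'qwen-max',
}
answer = request_url(llm_config, 'Summarize the change in one sentence.')
if answer is not None:  # request_url returns None when the request fails
    print(answer)
```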
evalscope/third_party/thinkbench/tools/utils.py
@@ -0,0 +1,13 @@
+import re
+
+
+def extract_answer(solution_text: str):
+    boxed_pattern = r'\\boxed\{([^}]*)\}'
+    matches = re.findall(boxed_pattern, solution_text)
+    if matches:
+        last_boxed_content = matches[-1]
+        number_pattern = r'-?\d+'
+        number_matches = re.findall(number_pattern, last_boxed_content)
+        if number_matches:
+            return number_matches[-1].strip()
+    return None
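The helper pulls the last \boxed{...} expression out of a solution string and returns the final number inside it. A few illustrative inputs (invented, not from the diff):

```python
# Illustrative behaviour of extract_answer; the inputs are made up.
print(extract_answer(r'Therefore the sum is \boxed{-42}.'))             # -> '-42'
print(extract_answer(r'First \boxed{7}, but finally \boxed{x = 12}.'))  # -> '12' (last box wins)
print(extract_answer('no boxed expression here'))                       # -> None
```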
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,37 +1,67 @@
-
+import os
 from dataclasses import dataclass
-from swift.llm import
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
+from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
+# Set the GPU environment variable
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-
-
-
-
-
-
+        # Initialize the model for the chosen inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
 
-
-
-
-
-
-
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
     def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed as:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "<user question>"},
+        #            {"role": "assistant", "content": "<assistant reply>"}, ...]
+
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
+
+        # Append the conversation history to the messages
+        for qa_pair in history:
+            # Each element of history is assumed to be ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-
+        # Extract the text result from the model response (assuming non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return
+        return result_text
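A rough driver for the rewritten wrapper, assuming a locally available model; the model path, model_type value, and prompts below are placeholders rather than values taken from the diff:

```python
# Hypothetical usage of SwiftInfer; model path/type and prompts are placeholders.
args = SwiftInferArgs(
    model_id_or_path='Qwen/Qwen2.5-7B-Instruct',
    model_type='qwen2_5',
    infer_backend='pt',   # 'pt' avoids the optional vllm/lmdeploy dependencies
    max_new_tokens=512,
)
engine = SwiftInfer(args)
reply = engine.predict(
    system='You are a helpful assistant.',
    query='What changed in the ToolBench static evaluation?',
    history=[('Hello', 'Hi! How can I help?')],  # list of (user, assistant) pairs
)
print(reply)
```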
evalscope/utils/chat_service.py CHANGED

evalscope/utils/filters.py
@@ -0,0 +1,59 @@
+import re
+from typing import Any, Callable, Dict
+
+
+class Filter:
+    """
+    A base Filter class that implements the registry pattern
+    """
+    _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        """
+        Decorator to register a new filter function
+        """
+
+        def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+            cls._registry[name] = func
+            return func
+
+        return decorator
+
+    @classmethod
+    def get_filter(cls, name: str) -> Callable:
+        """
+        Get a registered filter by name
+        """
+        return cls._registry.get(name)
+
+    @classmethod
+    def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+        """
+        Apply a registered filter to a value
+        """
+        filter_func = cls.get_filter(name)
+        if filter_func is None:
+            raise ValueError(f'Filter {name} not found')
+        return filter_func(value, *args, **kwargs)
+
+
+@Filter.register('remove_until')
+def remove_until(value: str, marker: str) -> str:
+    """
+    Remove everything before the last occurrence of marker
+    """
+    if marker not in value:
+        return value
+    return value[value.rindex(marker) + len(marker):]
+
+
+@Filter.register('extract')
+def extract(value: str, pattern: str) -> str:
+    """
+    Extract content from string using regex pattern
+    """
+    match = re.search(pattern, value)
+    if match:
+        return match.group(0)
+    return ''
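A short sketch of how the registry is meant to be used, both for the built-in filters above and for plugging in a new one; the sample text is invented for illustration:

```python
# Apply the built-in filters registered above; the sample response text is made up.
text = 'Let me think step by step... Answer: B'
print(Filter.apply('remove_until', text, 'Answer:'))  # ' B'
print(Filter.apply('extract', text, r'[A-D]\b'))      # 'B'

# Project-specific filters plug in through the same decorator.
@Filter.register('strip')
def strip_filter(value: str) -> str:
    return value.strip()

print(Filter.apply('strip', '  B  '))  # 'B'
```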
evalscope/utils/logger.py CHANGED
@@ -12,12 +12,12 @@ detailed_formatter = logging.Formatter(detailed_format)
 simple_formatter = logging.Formatter(simple_format)
 DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
-logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
+logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
 
-#
+# set logging level
 logging.getLogger('datasets').setLevel(logging.WARNING)
-logging.getLogger('modelscope').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.ERROR)
 
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
evalscope/utils/model_utils.py CHANGED
@@ -1,5 +1,6 @@
+import os
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 if TYPE_CHECKING:
     from transformers import GenerationConfig
@@ -22,3 +23,18 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
         generation_config.temperature = 1.
         generation_config.top_p = 1.
         generation_config.top_k = 50
+
+
+def get_device() -> str:
+    from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+    if is_torch_npu_available():
+        device = 'npu'
+    elif is_torch_mps_available():
+        device = 'mps'
+    elif is_torch_cuda_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    return device
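For reference, a one-line caller of the new helper (illustrative):

```python
# Illustrative: choose where to place a locally loaded model.
device = get_device()  # 'npu', 'mps', 'cuda', or 'cpu' depending on the host
print(f'Loading model on {device}')
```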
evalscope/utils/utils.py CHANGED
@@ -101,50 +101,50 @@ class ResponseParser:
         options_concat = '|'.join([str(i) for i in options])
 
         patterns = [
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            rf'答案是?\s?([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案应该?是\s?([{options_concat}])',
+            rf'答案应该?选\s?([{options_concat}])',
+            rf'答案为\s?([{options_concat}])',
+            rf'答案选\s?([{options_concat}])',
+            rf'选择?\s?([{options_concat}])',
+            rf'故选?\s?([{options_concat}])'
+            rf'只有选?项?\s?([{options_concat}])\s?是?对',
+            rf'只有选?项?\s?([{options_concat}])\s?是?错',
+            rf'只有选?项?\s?([{options_concat}])\s?不?正确',
+            rf'只有选?项?\s?([{options_concat}])\s?错误',
+            rf'说法不?对选?项?的?是\s?([{options_concat}])',
+            rf'说法不?正确选?项?的?是\s?([{options_concat}])',
+            rf'说法错误选?项?的?是\s?([{options_concat}])',
+            rf'([{options_concat}])\s?是正确的',
+            rf'([{options_concat}])\s?是正确答案',
+            rf'选项\s?([{options_concat}])\s?正确',
+            rf'所以答\s?([{options_concat}])',
+            rf'所以\s?([{options_concat}][.。$]?$)',
+            rf'所有\s?([{options_concat}][.。$]?$)',
+            rf'[\s,::,]([{options_concat}])[。,,\.]?$',
+            rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
+            rf'[\s,,::]因此([{options_concat}])[。\.]?$',
+            rf'[是为。]\s?([{options_concat}])[。\.]?$',
+            rf'因此\s?([{options_concat}])[。\.]?$',
+            rf'显然\s?([{options_concat}])[。\.]?$',
+            rf'答案是\s?(\S+)(?:。|$)',
+            rf'答案应该是\s?(\S+)(?:。|$)',
+            rf'答案为\s?(\S+)(?:。|$)',
+            rf'答案是(.*?)[{options_concat}]',
+            rf'答案为(.*?)[{options_concat}]',
+            rf'固选(.*?)[{options_concat}]',
+            rf'答案应该是(.*?)[{options_concat}]',
+            rf'[Tt]he answer is \(?[{options_concat}]\)?',
+            rf'[Tt]he correct answer is [{options_concat}]',
+            rf'[Tt]he correct answer is:\n[{options_concat}]',
+            rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
+            rf'^选项\s?([{options_concat}])',
+            rf'^([{options_concat}])\s?选?项',
+            rf'(\s|^)[{options_concat}][\s。,,::\.$]',
+            rf'(\s|^)[{options_concat}](\s|$)',
+            rf'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -166,8 +166,8 @@ class ResponseParser:
             text: The text to parse.
         """
         patterns = [
-            r'[Aa]nswer:\s*(\w+)',
             r'answer is \(?(\w+)\)?',
+            r'[Aa]nswer:\s*(\w+)',
            r'[Tt]he correct answer is:\s*(\w+)',
            r'[Tt]he correct answer is:\n\s*(\w+)',
            r'[Tt]he correct answer is:\n\n-\s*(\w+)',
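To see what these patterns do, here is a standalone sketch that mirrors how the option labels are joined into a character class; the sample model response is invented:

```python
import re

# Mirrors how the parser joins option labels; the sample response is made up.
options_concat = '|'.join(['A', 'B', 'C', 'D'])
pattern = re.compile(rf'答案是?\s?([{options_concat}])')  # "the answer is X"
match = pattern.search('经过分析,答案是 B。')  # "after analysis, the answer is B."
print(match.group(1))  # 'B'
```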
evalscope/version.py CHANGED

{evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.11.0
+Version: 0.12.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -21,6 +21,7 @@ Requires-Dist: accelerate
 Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
 Requires-Dist: editdistance
+Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
@@ -58,6 +59,7 @@ Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
+Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
@@ -101,10 +103,10 @@ Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"
@@ -223,7 +225,14 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-
+
+- 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
+- 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
+- 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
+- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -460,7 +469,7 @@ Then, you can use the following command to evaluate the model API service:
 ```shell
 evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
 --eval-type service \
 --datasets gsm8k \