evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/arguments.py +1 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +31 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +11 -2
- evalscope/config.py +10 -2
- evalscope/constants.py +7 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/http_client.py +6 -4
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +5 -4
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +135 -28
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/models/chat_adapter.py
CHANGED
@@ -3,8 +3,10 @@ import time
 import torch
 from typing import List, Union

+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
 logger = get_logger()


+@register_model_adapter(OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.
evalscope/models/choice_adapter.py
CHANGED

@@ -3,11 +3,14 @@ import time
 import torch
 from typing import List

+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage


+@register_model_adapter(OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """

@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}


+@register_model_adapter(OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/custom_adapter.py
CHANGED

@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.custom import CustomModel
+from evalscope.models.register import register_model_adapter


+@register_model_adapter('custom')
 class CustomModelAdapter(BaseModelAdapter):

     def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/register.py
ADDED

@@ -0,0 +1,28 @@
+MODEL_ADAPTERS = {}
+
+
+def register_model_adapter(name):
+    """
+    Decorator to register a model adapter with a given name.
+    :param name: The name of the model adapter.
+    """
+
+    def decorator(adapter):
+        if name in MODEL_ADAPTERS:
+            raise ValueError(f"Model adapter '{name}' is already registered.")
+        MODEL_ADAPTERS[name] = adapter
+        return adapter
+
+    return decorator
+
+
+def get_model_adapter(name):
+    """
+    Retrieve a registered model adapter by name.
+    :param name: The name of the model adapter.
+    :return: The model adapter class or function.
+    """
+    if name not in MODEL_ADAPTERS:
+        raise ValueError(
+            f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+    return MODEL_ADAPTERS[name]
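The new registry is a plain module-level dict plus a decorator. A minimal sketch of how it would be exercised, based only on the register.py code above; the DummyAdapter class and the 'dummy' name are hypothetical and used purely for illustration:

from evalscope.models.register import get_model_adapter, register_model_adapter


@register_model_adapter('dummy')  # hypothetical adapter name, not part of evalscope
class DummyAdapter:

    def predict(self, inputs):
        # Echo the inputs back; a real adapter would run a model here.
        return [{'echo': item} for item in inputs]


adapter_cls = get_model_adapter('dummy')  # -> DummyAdapter
print(adapter_cls().predict(['hello']))   # [{'echo': 'hello'}]
# Registering 'dummy' a second time would raise ValueError, per the guard above.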
evalscope/models/server_adapter.py
CHANGED

@@ -1,15 +1,18 @@
 import openai
 from collections import defaultdict
+from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

 from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.logger import get_logger

 logger = get_logger()


+@register_model_adapter('server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.
@@ -30,6 +33,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
+        self.supported_params = self._get_supported_params()

         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -37,12 +41,16 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def
+    def _get_supported_params(self):
+        sig = signature(self.client.chat.completions.create)
+        return list(sig.parameters.keys())
+
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.

         Args:
-            inputs (List[
+            inputs (List[dict]): The input data.
             infer_cfg (dict): Inference configuration.

         Returns:
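The new _get_supported_params reads the call signature of the installed OpenAI client's chat.completions.create to learn which keyword arguments it accepts. A self-contained sketch of the same idea, with a stand-in function instead of the OpenAI SDK (the create function below is purely illustrative):

from inspect import signature


def create(model, messages, temperature=1.0, stream=False, **kwargs):
    """Stand-in for client.chat.completions.create, for illustration only."""
    return {'model': model, 'messages': messages}


supported_params = list(signature(create).parameters.keys())
print(supported_params)  # ['model', 'messages', 'temperature', 'stream', 'kwargs']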
@@ -104,34 +112,52 @@ class ServerModelAdapter(BaseModelAdapter):
             request_json['stream_options'] = {'include_usage': True}

         logger.debug(f'Request to remote API: {request_json}')
+
         return request_json

+    def _parse_extra_params(self, request_json):
+        api_params = {}
+        extra_body = {}
+        for key, value in request_json.items():
+            if key in self.supported_params:
+                api_params[key] = value
+            else:
+                extra_body[key] = value
+
+        if extra_body:
+            api_params['extra_body'] = extra_body
+        return api_params
+
     def send_request(self, request_json: dict) -> dict:
         try:
-
+            parsed_request = self._parse_extra_params(request_json)
+            response = self.client.chat.completions.create(**parsed_request)

-            if self.stream:
+            if response and self.stream:
                 response = self._collect_stream_response(response)

             return response.model_dump(exclude_unset=True)
         except Exception as e:
-            logger.error(f'Error when calling
+            logger.error(f'Error when calling remote API: {str(e)}')
             raise

     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
         collected_messages = defaultdict(list)
+        collected_reasoning = defaultdict(list)

         for chunk in response_stream:
             collected_chunks.append(chunk)
             for choice in chunk.choices:
+                if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                    collected_reasoning[choice.index].append(choice.delta.reasoning_content)
                 if choice.delta.content is not None:
                     collected_messages[choice.index].append(choice.delta.content)

         choices = []
         for index, messages in collected_messages.items():
             full_reply_content = ''.join(messages)
-
+            reasoning = ''.join(collected_reasoning[index])
             # use the finish_reason from the last chunk that generated this choice
             finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -140,9 +166,10 @@ class ServerModelAdapter(BaseModelAdapter):
                     break

             choice = Choice(
-                finish_reason=finish_reason,
+                finish_reason=finish_reason or 'stop',
                 index=index,
-                message=ChatCompletionMessage(
+                message=ChatCompletionMessage(
+                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
             choices.append(choice)

         # build the final completion object
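The reworked _collect_stream_response now gathers both content and the optional reasoning_content deltas per choice index before rebuilding a single completion. A simplified, self-contained sketch of that aggregation, with namedtuples standing in for the OpenAI ChatCompletionChunk objects (the stand-in types and sample chunks are hypothetical):

from collections import defaultdict, namedtuple

# Hypothetical stand-ins for the streamed chunk/delta objects.
Delta = namedtuple('Delta', ['content', 'reasoning_content'])
StreamChoice = namedtuple('StreamChoice', ['index', 'delta'])

chunks = [
    [StreamChoice(0, Delta(None, 'thinking about the answer...'))],
    [StreamChoice(0, Delta('Hello', None))],
    [StreamChoice(0, Delta(' world', None))],
]

collected_messages = defaultdict(list)
collected_reasoning = defaultdict(list)
for choices in chunks:
    for choice in choices:
        if choice.delta.reasoning_content is not None:
            collected_reasoning[choice.index].append(choice.delta.reasoning_content)
        if choice.delta.content is not None:
            collected_messages[choice.index].append(choice.delta.content)

for index, parts in collected_messages.items():
    print(''.join(collected_reasoning[index]))  # thinking about the answer...
    print(''.join(parts))                       # Hello world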
evalscope/perf/arguments.py
CHANGED
@@ -21,9 +21,9 @@ class Arguments:
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
     headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-    connect_timeout: int =
-    read_timeout: int =
-    api_key: str =
+    connect_timeout: int = 600  # Connection timeout in seconds
+    read_timeout: int = 600  # Read timeout in seconds
+    api_key: Optional[str] = None

     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made
@@ -125,7 +125,13 @@ class ParseKVAction(argparse.Action):
             setattr(namespace, self.dest, {})
         else:
             try:
-                kv_dict =
+                kv_dict = {}
+                for kv in values:
+                    parts = kv.split('=', 1)  # only split the first '='
+                    if len(parts) != 2:
+                        raise ValueError(f'Invalid key-value pair: {kv}')
+                    key, value = parts
+                    kv_dict[key.strip()] = value.strip()
                 setattr(namespace, self.dest, kv_dict)
             except ValueError as e:
                 parser.error(f'Error parsing key-value pairs: {e}')
@@ -144,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-    parser.add_argument('--api-key', type=str, required=False, default=
-    parser.add_argument('--connect-timeout', type=int, default=
-    parser.add_argument('--read-timeout', type=int, default=
+    parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+    parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+    parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')

     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
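The rewritten ParseKVAction splits each pair on the first '=' only, so header values that themselves contain '=' are preserved. A minimal argparse sketch under the same assumption; the simplified action class below is illustrative, not the evalscope implementation:

import argparse


class ParseKV(argparse.Action):
    """Simplified key=value parser that splits on the first '=' only."""

    def __call__(self, parser, namespace, values, option_string=None):
        kv_dict = {}
        for kv in values:
            key, sep, value = kv.partition('=')
            if not sep:
                parser.error(f'Invalid key-value pair: {kv}')
            kv_dict[key.strip()] = value.strip()
        setattr(namespace, self.dest, kv_dict)


parser = argparse.ArgumentParser()
parser.add_argument('--headers', nargs='+', action=ParseKV)
args = parser.parse_args(['--headers', 'Authorization=Bearer a=b', 'X-Trace=1'])
print(args.headers)  # {'Authorization': 'Bearer a=b', 'X-Trace': '1'}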
evalscope/perf/http_client.py
CHANGED
@@ -23,10 +23,7 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.client = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(
-                total=self.read_timeout + self.connect_timeout,
-                connect=self.connect_timeout,
-                sock_read=self.read_timeout),
+            timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
             trace_configs=[self._create_trace_config()] if args.debug else [])

@@ -102,6 +99,11 @@ class AioHttpClient:
             async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                 async for rsp in self._handle_response(response):
                     yield rsp
+        except asyncio.TimeoutError:
+            logger.error(
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+            )
+            yield (True, None, 'Timeout')
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
             yield (True, None, e)
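With the change above, the client no longer caps the whole request with a total timeout; only the connection phase and each socket read are bounded, which suits long streaming generations. A hedged sketch of that configuration against the default local endpoint (the URL and payload are placeholders):

import asyncio

import aiohttp


async def main():
    # Per-phase timeouts only, mirroring the ClientTimeout change above.
    timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        try:
            async with session.post('http://127.0.0.1:8877/v1/chat/completions',
                                    json={'model': 'test', 'messages': []}) as resp:
                print(resp.status)
        except (asyncio.TimeoutError, aiohttp.ClientError) as err:
            # Raise the connect/read timeouts if the server is slow to respond.
            print(f'Request failed: {err!r}')


asyncio.run(main())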
evalscope/perf/utils/analysis_result.py
CHANGED

@@ -3,7 +3,7 @@ import json
 import pickle
 import sqlite3

-result_db_path = '
+result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
     FROM result WHERE success='1'"
evalscope/report/app.py
CHANGED
@@ -125,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):


 def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+
     logger.debug(f'df: {df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
     width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
evalscope/report/combinator.py
CHANGED
@@ -57,8 +57,8 @@ class ReportsRecorder:


 if __name__ == '__main__':
-    report_dir_1 = '
-    # report_dir_2 = '
+    report_dir_1 = './outputs/20250117_151926'
+    # report_dir_2 = './outputs/20250107_204445/reports'

     report_table = gen_table([report_dir_1])
     print(report_table)
evalscope/run.py
CHANGED
@@ -2,7 +2,7 @@
 """
 Run evaluation for LLMs.
 """
-import os
+import os
 from argparse import Namespace
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union
@@ -127,16 +127,17 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.models import initialize_model_adapter

     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-    # Initialize data adapter
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))

     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
+        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
         return EvaluatorCollection(task_cfg, data_adapter, outputs)

     # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, benchmark
+    model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()