evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +3 -1
- evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
- evalscope/benchmarks/benchmark.py +12 -10
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
- evalscope/benchmarks/data_adapter.py +82 -19
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
- evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
- evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +71 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
- evalscope/benchmarks/utils.py +43 -0
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +16 -1
- evalscope/config.py +13 -3
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +32 -6
- evalscope/models/chat_adapter.py +4 -1
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/local_model.py +3 -2
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +107 -29
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +18 -8
- evalscope/perf/http_client.py +8 -6
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +15 -8
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +429 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +151 -32
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ metric_registry = MetricRegistry()
 metric_registry.register(Metric(name='AverageAccuracy', object=mean))
 metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
 metric_registry.register(Metric(name='AverageBLEU', object=mean))
+metric_registry.register(Metric(name='AverageRouge', object=mean))
 metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
 metric_registry.register(Metric(name='AveragePass@1', object=mean))
 for k in range(1, 17):
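The hunk above registers the new AverageRouge metric with the same registry used by the existing metrics. For reference, a user-defined aggregation can be registered the same way; the metric name and aggregation function below are illustrative, and the import path is an assumption rather than something shown in this diff:

from statistics import median

from evalscope.metrics import Metric, metric_registry  # assumed export; see evalscope/metrics/__init__.py

# Register a median-based accuracy aggregation alongside the built-in ones.
metric_registry.register(Metric(name='MedianAccuracy', object=median))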
evalscope/models/__init__.py
CHANGED

@@ -7,10 +7,11 @@ from evalscope.models.custom import CustomModel
 from evalscope.models.custom_adapter import CustomModelAdapter
 from evalscope.models.local_model import LocalModel, get_local_model
 from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
+from evalscope.models.register import get_model_adapter
 from evalscope.models.server_adapter import ServerModelAdapter

 __all__ = [
     'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
     'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
-    'LocalModel', 'get_local_model', 'initialize_model_adapter'
+    'LocalModel', 'get_local_model', 'initialize_model_adapter', 'get_model_adapter'
 ]
evalscope/models/base_adapter.py
CHANGED

@@ -1,15 +1,21 @@
 import torch
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Union

-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.models.custom import CustomModel
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import get_model_adapter, register_model_adapter
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()

 if TYPE_CHECKING:
+    from evalscope.benchmarks import BenchmarkMeta
     from evalscope.config import TaskConfig


+@register_model_adapter('base')
 class BaseModelAdapter(ABC):

     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):

@@ -33,7 +39,7 @@ class BaseModelAdapter(ABC):
         raise NotImplementedError


-def initialize_model_adapter(task_cfg: 'TaskConfig',
+def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta', base_model: 'LocalModel'):
     """Initialize the model adapter based on the task configuration."""
     if task_cfg.dry_run:
         from evalscope.models.model import DummyChatModel

@@ -43,10 +49,30 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE:
+    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
         from evalscope.models import ServerModelAdapter
+
+        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
+            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
+                           'Setting output type to generation by default.')
+            benchmark.model_adapter = OutputType.GENERATION
+
         return ServerModelAdapter(
-            api_url=task_cfg.api_url,
+            api_url=task_cfg.api_url,
+            model_id=task_cfg.model,
+            api_key=task_cfg.api_key,
+            seed=task_cfg.seed,
+            timeout=task_cfg.timeout,
+            stream=task_cfg.stream,
+        )
     else:
-
+        # for local model, we need to determine the model adapter class based on the output type
+        model_adapter_cls = benchmark.model_adapter
+        if model_adapter_cls not in benchmark.output_types:
+            logger.warning(f'Output type {model_adapter_cls} is not supported for benchmark {benchmark.name}. '
+                           f'Using {benchmark.output_types[0]} instead.')
+            model_adapter_cls = benchmark.output_types[0]
+
+        model_adapter = get_model_adapter(model_adapter_cls)
+        return model_adapter(
             model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
evalscope/models/chat_adapter.py
CHANGED

@@ -3,8 +3,10 @@ import time
 import torch
 from typing import List, Union

+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning

@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
 logger = get_logger()


+@register_model_adapter(OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.

@@ -102,7 +105,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Get input ids
         inputs = self.tokenizer(
             formatted_prompts, return_tensors='pt', padding=True, truncation=True,
-            padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+            padding_side='left').to(self.model.device)  # padding_side='left' is important for chat model
         input_ids = inputs['input_ids']

         # Run inference
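The batched-generation path keeps padding_side='left' and now moves the encoded batch to self.model.device instead of a separately tracked device attribute. A standalone transformers sketch of why left padding matters here; the checkpoint name is a placeholder:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint
tok.padding_side = 'left'
batch = tok(['Hi', 'A much longer prompt'], return_tensors='pt', padding=True)
# With left padding, generate() appends new tokens directly after each prompt;
# right padding would leave pad tokens between a prompt and its continuation.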
evalscope/models/choice_adapter.py
CHANGED

@@ -3,11 +3,14 @@ import time
 import torch
 from typing import List

+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage


+@register_model_adapter(OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """

@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}


+@register_model_adapter(OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/custom_adapter.py
CHANGED

@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.custom import CustomModel
+from evalscope.models.register import register_model_adapter


+@register_model_adapter('custom')
 class CustomModelAdapter(BaseModelAdapter):

     def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/local_model.py
CHANGED

@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Optional

 from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import get_device

 if TYPE_CHECKING:
     from evalscope.config import TaskConfig

@@ -28,7 +29,7 @@ class LocalModel:

         self.model_id = model_id
         self.model_revision = model_revision
-        self.device =
+        self.device = device_map

         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_id,

@@ -64,7 +65,7 @@ def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
     if task_cfg.eval_type != EvalType.CHECKPOINT:
         return None
     else:
-        device_map = task_cfg.model_args.get('device_map',
+        device_map = task_cfg.model_args.get('device_map', get_device())
         cache_dir = task_cfg.model_args.get('cache_dir', None)
         model_precision = task_cfg.model_args.get('precision', 'torch.float16')
         model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
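get_local_model now falls back to get_device() when no device_map is given. The helper's body lives in evalscope/utils/model_utils.py and is not shown in this diff; a plausible sketch under the assumption that it simply prefers CUDA, then Apple MPS, then CPU:

import torch

def get_device() -> str:
    # Assumed preference order; the real helper may also handle other accelerators.
    if torch.cuda.is_available():
        return 'cuda'
    if torch.backends.mps.is_available():
        return 'mps'
    return 'cpu'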
evalscope/models/register.py
ADDED

@@ -0,0 +1,28 @@
+MODEL_ADAPTERS = {}
+
+
+def register_model_adapter(name):
+    """
+    Decorator to register a model adapter with a given name.
+    :param name: The name of the model adapter.
+    """
+
+    def decorator(adapter):
+        if name in MODEL_ADAPTERS:
+            raise ValueError(f"Model adapter '{name}' is already registered.")
+        MODEL_ADAPTERS[name] = adapter
+        return adapter
+
+    return decorator
+
+
+def get_model_adapter(name):
+    """
+    Retrieve a registered model adapter by name.
+    :param name: The name of the model adapter.
+    :return: The model adapter class or function.
+    """
+    if name not in MODEL_ADAPTERS:
+        raise ValueError(
+            f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+    return MODEL_ADAPTERS[name]
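The registry gives third-party code a hook point: any adapter decorated with register_model_adapter becomes resolvable by name through get_model_adapter. A hypothetical example; the adapter name and class are illustrative, and predict is assumed to be the abstract method to override:

from evalscope.models import BaseModelAdapter
from evalscope.models.register import get_model_adapter, register_model_adapter

@register_model_adapter('my_adapter')  # illustrative name, not part of this release
class MyAdapter(BaseModelAdapter):

    def predict(self, *args, **kwargs):
        raise NotImplementedError('demo only')

assert get_model_adapter('my_adapter') is MyAdapter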
evalscope/models/server_adapter.py
CHANGED

@@ -1,14 +1,18 @@
-import
-import
+import openai
+from collections import defaultdict
+from inspect import signature
+from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

 from evalscope.models.base_adapter import BaseModelAdapter
-from evalscope.
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.logger import get_logger

 logger = get_logger()


+@register_model_adapter('server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.

@@ -21,19 +25,32 @@ class ServerModelAdapter(BaseModelAdapter):
             model_id: The ID of the remote API model.
             api_key: The API key of the remote API model.
         """
-        self.api_url = api_url
+        self.api_url = api_url.rstrip('/').rsplit('/chat/completions', 1)[0]
         self.model_id = model_id
         self.api_key = api_key
+
+        self.client = openai.OpenAI(
+            api_key=api_key,
+            base_url=self.api_url,
+        )
+        self.supported_params = self._get_supported_params()
+
         self.seed = kwargs.get('seed', None)
+        self.timeout = kwargs.get('timeout', 60)
+        self.stream = kwargs.get('stream', False)
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def
+    def _get_supported_params(self):
+        sig = signature(self.client.chat.completions.create)
+        return list(sig.parameters.keys())
+
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.

         Args:
-            inputs (List[
+            inputs (List[dict]): The input data.
             infer_cfg (dict): Inference configuration.

         Returns:

@@ -63,20 +80,19 @@ class ServerModelAdapter(BaseModelAdapter):
         response = self.send_request(request_json)
         return response

-    def make_request_content(self, query: str, system_prompt: Optional[str] = None) ->
+    def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> list:
         """
-        Make request content for API.
+        Make request content for OpenAI API.
         """
+        messages = []
         if system_prompt:
-            messages
-
-
-            ]
-        else:
-            messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
-        return {'messages': messages}
+            messages.append({'role': 'system', 'content': system_prompt})
+
+        messages.append({'role': 'user', 'content': query})

-
+        return messages
+
+    def make_request(self, content: list, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
         from evalscope.config import DEFAULT_GENERATION_CONFIG

@@ -86,20 +102,82 @@ class ServerModelAdapter(BaseModelAdapter):
             'temperature': 0.0,
         }

-        request_json = {'model': self.model_id,
+        request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
+
+        if self.timeout:
+            request_json['timeout'] = self.timeout
+
+        if self.stream:
+            request_json['stream'] = self.stream
+            request_json['stream_options'] = {'include_usage': True}
+
         logger.debug(f'Request to remote API: {request_json}')
+
         return request_json

-    def
-
-
-
-        if
-
-            return response_data
-        logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
-        if attempt < max_retries - 1:
-            time.sleep(5)  # Sleep for 5 seconds before retrying
+    def _parse_extra_params(self, request_json):
+        api_params = {}
+        extra_body = {}
+        for key, value in request_json.items():
+            if key in self.supported_params:
+                api_params[key] = value
             else:
-
-
+                extra_body[key] = value
+
+        if extra_body:
+            api_params['extra_body'] = extra_body
+        return api_params
+
+    def send_request(self, request_json: dict) -> dict:
+        try:
+            parsed_request = self._parse_extra_params(request_json)
+            response = self.client.chat.completions.create(**parsed_request)
+
+            if response and self.stream:
+                response = self._collect_stream_response(response)
+
+            return response.model_dump(exclude_unset=True)
+        except Exception as e:
+            logger.error(f'Error when calling remote API: {str(e)}')
+            raise
+
+    def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
+        collected_chunks = []
+        collected_messages = defaultdict(list)
+        collected_reasoning = defaultdict(list)
+
+        for chunk in response_stream:
+            collected_chunks.append(chunk)
+            for choice in chunk.choices:
+                if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                    collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+                if choice.delta.content is not None:
+                    collected_messages[choice.index].append(choice.delta.content)
+
+        choices = []
+        for index, messages in collected_messages.items():
+            full_reply_content = ''.join(messages)
+            reasoning = ''.join(collected_reasoning[index])
+            # use the finish_reason from the last chunk that generated this choice
+            finish_reason = None
+            for chunk in reversed(collected_chunks):
+                if chunk.choices and chunk.choices[0].index == index:
+                    finish_reason = chunk.choices[0].finish_reason
+                    break
+
+            choice = Choice(
+                finish_reason=finish_reason or 'stop',
+                index=index,
+                message=ChatCompletionMessage(
+                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
+            choices.append(choice)
+
+        # build the final completion object
+        return ChatCompletion(
+            id=collected_chunks[0].id,
+            choices=choices,
+            created=collected_chunks[0].created,
+            model=collected_chunks[0].model,
+            object='chat.completion',
+            usage=collected_chunks[-1].usage  # use the usage from the last chunk
+        )
evalscope/perf/__init__.py
CHANGED

@@ -1 +0,0 @@
-from evalscope.perf.main import run_perf_benchmark
evalscope/perf/arguments.py
CHANGED

@@ -21,9 +21,9 @@ class Arguments:
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
     headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-    connect_timeout: int =
-    read_timeout: int =
-    api_key: str =
+    connect_timeout: int = 600  # Connection timeout in seconds
+    read_timeout: int = 600  # Read timeout in seconds
+    api_key: Optional[str] = None

     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made

@@ -61,6 +61,7 @@ class Arguments:
     stream: Optional[bool] = None  # Whether to stream the response
     temperature: Optional[float] = None  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
+    top_k: Optional[int] = None  # Top-k sampling setting for the response

     @staticmethod
     def from_args(args):

@@ -99,7 +100,9 @@ class Arguments:
             stop_token_ids=args.stop_token_ids,
             stream=args.stream,
             temperature=args.temperature,
-            top_p=args.top_p
+            top_p=args.top_p,
+            top_k=args.top_k,
+        )

     def __post_init__(self):
         self.headers = self.headers or {}  # Default to empty dictionary

@@ -122,7 +125,13 @@ class ParseKVAction(argparse.Action):
             setattr(namespace, self.dest, {})
         else:
             try:
-                kv_dict =
+                kv_dict = {}
+                for kv in values:
+                    parts = kv.split('=', 1)  # only split the first '='
+                    if len(parts) != 2:
+                        raise ValueError(f'Invalid key-value pair: {kv}')
+                    key, value = parts
+                    kv_dict[key.strip()] = value.strip()
                 setattr(namespace, self.dest, kv_dict)
             except ValueError as e:
                 parser.error(f'Error parsing key-value pairs: {e}')

@@ -141,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-    parser.add_argument('--api-key', type=str, required=False, default=
-    parser.add_argument('--connect-timeout', type=int, default=
-    parser.add_argument('--read-timeout', type=int, default=
+    parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+    parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+    parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')

     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')

@@ -183,6 +192,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+    parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)

     # yapf: enable
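ParseKVAction now splits each key=value pair only on the first '=', so values that themselves contain '=' survive intact. A minimal standalone check of that behaviour (plain argparse, not the full evalscope perf CLI; header values are placeholders):

import argparse

from evalscope.perf.arguments import ParseKVAction

parser = argparse.ArgumentParser()
parser.add_argument('--headers', nargs='+', action=ParseKVAction)
ns = parser.parse_args(['--headers', 'Authorization=Bearer sk-xxx', 'X-Trace-Id=a=b'])
print(ns.headers)  # {'Authorization': 'Bearer sk-xxx', 'X-Trace-Id': 'a=b'}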
evalscope/perf/http_client.py
CHANGED

@@ -23,10 +23,7 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.client = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(
-                total=self.read_timeout + self.connect_timeout,
-                connect=self.connect_timeout,
-                sock_read=self.read_timeout),
+            timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
             trace_configs=[self._create_trace_config()] if args.debug else [])

@@ -102,6 +99,11 @@ class AioHttpClient:
             async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                 async for rsp in self._handle_response(response):
                     yield rsp
+        except asyncio.TimeoutError:
+            logger.error(
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+            )
+            yield (True, None, 'Timeout')
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
             yield (True, None, e)

@@ -143,9 +145,9 @@ async def test_connection(args: Arguments) -> bool:
     client = AioHttpClient(args)
     async with client:
         if 'chat/completions' in args.url:
-            request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model}
+            request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model, 'max_tokens': 10}
         else:
-            request = {'prompt': 'hello', 'model': args.model}
+            request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
         async for is_error, state_code, response_data in client.post(request):
            return is_error, state_code, response_data
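Dropping the total= cap means only the connection phase and the gap between reads are bounded, so a long streaming generation is no longer cut off as long as chunks keep arriving. The equivalent standalone aiohttp configuration, for reference (the URL is a placeholder):

import asyncio
import aiohttp

async def main():
    # No overall deadline: only connect and per-read timeouts (600 s matches the new defaults).
    timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get('http://127.0.0.1:8877/v1/models') as resp:  # placeholder URL
            print(resp.status)

asyncio.run(main())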
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -92,6 +92,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['temperature'] = param.temperature
         if param.top_p is not None:
             payload['top_p'] = param.top_p
+        if param.top_k is not None:
+            payload['top_k'] = param.top_k
         return payload

     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:

@@ -155,5 +157,13 @@ class OpenaiPlugin(ApiPluginBase):
             input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
             output_tokens += len(self.tokenizer.encode(full_response_content))
         else:
-
+            raise ValueError('Error: Unable to retrieve usage information\n\n'
+                             'This error occurs when:\n'
+                             '1. The API response does not contain usage data, AND\n'
+                             '2. No tokenizer has been specified or found.\n\n'
+                             'To resolve this issue, do ONE of the following:\n'
+                             "a) Ensure that the API you're using supports and returns usage information, OR\n"
+                             'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+                             'If you continue to experience issues, '
+                             'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .')
         return input_tokens, output_tokens
evalscope/perf/utils/analysis_result.py
CHANGED

@@ -3,7 +3,7 @@ import json
 import pickle
 import sqlite3

-result_db_path = '
+result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
     FROM result WHERE success='1'"
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -23,6 +23,7 @@ class BenchmarkData:
     n_chunks: int = 0
     n_chunks_time: float = 0.0
     max_gpu_memory_cost = 0
+    time_per_output_token: float = 0.0

     prompt_tokens = None
     completion_tokens = None

@@ -37,6 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
+            self.time_per_output_token = self.query_latency / self.completion_tokens

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \

@@ -63,6 +65,7 @@ class BenchmarkMetrics:
     start_time: Optional[float] = None
     total_time: float = 1.0
     n_total_queries: int = 0
+    n_time_per_output_token: float = 0.0

     avg_first_chunk_latency: float = -1
     avg_latency: float = -1

@@ -92,6 +95,7 @@ class BenchmarkMetrics:
             self.total_first_chunk_latency += benchmark_data.first_chunk_latency
             self.n_total_chunks += benchmark_data.n_chunks
             self.total_chunks_time += benchmark_data.n_chunks_time
+            self.n_time_per_output_token += benchmark_data.time_per_output_token
         else:
             self.n_failed_queries += 1

@@ -108,7 +112,7 @@ class BenchmarkMetrics:
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
             self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
-            self.avg_time_per_token = self.
+            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)

@@ -125,7 +129,7 @@ class BenchmarkMetrics:
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-            'Average time per output token (s)': round(self.avg_time_per_token,
+            'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
             'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
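The new per-token latency is a simple per-query ratio that is then averaged over the queries that succeeded; a worked example with illustrative numbers:

# Non-streaming query: 3.0 s end-to-end, 150 completion tokens.
query_latency, completion_tokens = 3.0, 150
time_per_output_token = query_latency / completion_tokens  # 0.02 s/token for this query
# BenchmarkMetrics accumulates these values and reports
# avg_time_per_token = n_time_per_output_token / n_succeed_queries.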
evalscope/report/app.py
CHANGED

@@ -19,6 +19,9 @@ from evalscope.version import __version__
 logger = get_logger()

 PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '


 def scan_for_report_folders(root_path):

@@ -42,8 +45,9 @@ def scan_for_report_folders(root_path):
             datasets = []
             for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
                 datasets.append(os.path.basename(dataset_item).split('.')[0])
-            datasets =
-            reports.append(
+            datasets = DATASET_TOKEN.join(datasets)
+            reports.append(
+                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

     reports = sorted(reports, reverse=True)
     logger.debug(f'reports: {reports}')

@@ -51,9 +55,9 @@ def scan_for_report_folders(root_path):


 def process_report_name(report_name: str):
-    prefix, report_name = report_name.split(
-    model_name, datasets = report_name.split(
-    datasets = datasets.split(
+    prefix, report_name = report_name.split(REPORT_TOKEN)
+    model_name, datasets = report_name.split(MODEL_TOKEN)
+    datasets = datasets.split(DATASET_TOKEN)
     return prefix, model_name, datasets


@@ -121,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):


 def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+    logger.debug(f'df: {df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

     width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None

@@ -171,7 +178,7 @@ def plot_single_dataset_scores(df: pd.DataFrame):
                     text=df[ReportKey.score],
                     barmode='group')

-    width = 0.2 if len(df[ReportKey.subset_name]) <=
+    width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
     plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
     plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
     return plot

@@ -519,8 +526,8 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
     def update_single_report_data(root_path, report_name):
         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
-        work_dir = os.path.join(root_path, report_name.split(
-        model_name = report_name.split(
+        work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+        model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
         return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

     @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])