evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the evalscope package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic; see the registry's advisory page for details.

Files changed (89)
  1. evalscope/arguments.py +3 -1
  2. evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
  6. evalscope/benchmarks/benchmark.py +12 -10
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
  10. evalscope/benchmarks/data_adapter.py +82 -19
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
  23. evalscope/benchmarks/musr/__init__.py +0 -0
  24. evalscope/benchmarks/musr/musr_adapter.py +71 -0
  25. evalscope/benchmarks/process_bench/__init__.py +0 -0
  26. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  27. evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
  28. evalscope/benchmarks/race/race_adapter.py +12 -16
  29. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  30. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  31. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  32. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  33. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  34. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  35. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  36. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  37. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
  38. evalscope/benchmarks/utils.py +43 -0
  39. evalscope/cli/start_app.py +4 -1
  40. evalscope/cli/start_eval.py +4 -3
  41. evalscope/cli/start_perf.py +4 -2
  42. evalscope/collections/evaluator.py +16 -1
  43. evalscope/config.py +13 -3
  44. evalscope/constants.py +7 -0
  45. evalscope/evaluator/evaluator.py +3 -1
  46. evalscope/metrics/__init__.py +2 -1
  47. evalscope/metrics/metrics.py +23 -2
  48. evalscope/metrics/named_metrics.py +1 -0
  49. evalscope/models/__init__.py +2 -1
  50. evalscope/models/base_adapter.py +32 -6
  51. evalscope/models/chat_adapter.py +4 -1
  52. evalscope/models/choice_adapter.py +4 -0
  53. evalscope/models/custom_adapter.py +2 -0
  54. evalscope/models/local_model.py +3 -2
  55. evalscope/models/register.py +28 -0
  56. evalscope/models/server_adapter.py +107 -29
  57. evalscope/perf/__init__.py +0 -1
  58. evalscope/perf/arguments.py +18 -8
  59. evalscope/perf/http_client.py +8 -6
  60. evalscope/perf/plugin/api/openai_api.py +11 -1
  61. evalscope/perf/utils/analysis_result.py +1 -1
  62. evalscope/perf/utils/benchmark_util.py +6 -2
  63. evalscope/report/app.py +15 -8
  64. evalscope/report/combinator.py +2 -2
  65. evalscope/run.py +6 -5
  66. evalscope/third_party/thinkbench/__init__.py +3 -0
  67. evalscope/third_party/thinkbench/eval.py +429 -0
  68. evalscope/third_party/thinkbench/infer.py +130 -0
  69. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  70. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  71. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  73. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  74. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  75. evalscope/utils/chat_service.py +1 -0
  76. evalscope/utils/filters.py +59 -0
  77. evalscope/utils/logger.py +3 -3
  78. evalscope/utils/model_utils.py +17 -1
  79. evalscope/utils/utils.py +45 -45
  80. evalscope/version.py +2 -2
  81. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
  82. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
  83. tests/cli/test_collection.py +1 -1
  84. tests/cli/test_run.py +151 -32
  85. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  86. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  87. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  88. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  89. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/metrics/named_metrics.py CHANGED
@@ -35,6 +35,7 @@ metric_registry = MetricRegistry()
  metric_registry.register(Metric(name='AverageAccuracy', object=mean))
  metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
  metric_registry.register(Metric(name='AverageBLEU', object=mean))
+ metric_registry.register(Metric(name='AverageRouge', object=mean))
  metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
  metric_registry.register(Metric(name='AveragePass@1', object=mean))
  for k in range(1, 17):
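For orientation, the hunk above follows the registry pattern used throughout this file: a named Metric pairs a label with an aggregation callable such as mean or weighted_mean. The following is a self-contained sketch of that pattern, not the evalscope API itself; names and numbers are illustrative.

from dataclasses import dataclass
from statistics import mean
from typing import Callable, Dict, List

@dataclass
class Metric:
    name: str
    object: Callable[[List[float]], float]  # aggregation applied to per-sample scores

registry: Dict[str, Metric] = {}

def register(metric: Metric) -> None:
    registry[metric.name] = metric

# Mirrors the new AverageRouge registration: ROUGE scores are aggregated by a plain mean.
register(Metric(name='AverageRouge', object=mean))
print(registry['AverageRouge'].object([0.41, 0.55, 0.48]))  # ≈ 0.48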
evalscope/models/__init__.py CHANGED
@@ -7,10 +7,11 @@ from evalscope.models.custom import CustomModel
  from evalscope.models.custom_adapter import CustomModelAdapter
  from evalscope.models.local_model import LocalModel, get_local_model
  from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
+ from evalscope.models.register import get_model_adapter
  from evalscope.models.server_adapter import ServerModelAdapter

  __all__ = [
      'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
      'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
-     'LocalModel', 'get_local_model', 'initialize_model_adapter'
+     'LocalModel', 'get_local_model', 'initialize_model_adapter', 'get_model_adapter'
  ]
evalscope/models/base_adapter.py CHANGED
@@ -1,15 +1,21 @@
  import torch
  from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from typing import TYPE_CHECKING, Any, List, Optional, Union

- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.models.custom import CustomModel
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import get_model_adapter, register_model_adapter
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  if TYPE_CHECKING:
+     from evalscope.benchmarks import BenchmarkMeta
      from evalscope.config import TaskConfig


+ @register_model_adapter('base')
  class BaseModelAdapter(ABC):

      def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
@@ -33,7 +39,7 @@ class BaseModelAdapter(ABC):
          raise NotImplementedError


- def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'):
+ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta', base_model: 'LocalModel'):
      """Initialize the model adapter based on the task configuration."""
      if task_cfg.dry_run:
          from evalscope.models.model import DummyChatModel
@@ -43,10 +49,30 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
              raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
          from evalscope.models import CustomModelAdapter
          return CustomModelAdapter(custom_model=task_cfg.model)
-     elif task_cfg.eval_type == EvalType.SERVICE:
+     elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
          from evalscope.models import ServerModelAdapter
+
+         if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
+             logger.warning('Output type is set to logits. This is not supported for service evaluation. '
+                            'Setting output type to generation by default.')
+             benchmark.model_adapter = OutputType.GENERATION
+
          return ServerModelAdapter(
-             api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed)
+             api_url=task_cfg.api_url,
+             model_id=task_cfg.model,
+             api_key=task_cfg.api_key,
+             seed=task_cfg.seed,
+             timeout=task_cfg.timeout,
+             stream=task_cfg.stream,
+         )
      else:
-         return model_adapter_cls(
+         # for local model, we need to determine the model adapter class based on the output type
+         model_adapter_cls = benchmark.model_adapter
+         if model_adapter_cls not in benchmark.output_types:
+             logger.warning(f'Output type {model_adapter_cls} is not supported for benchmark {benchmark.name}. '
+                            f'Using {benchmark.output_types[0]} instead.')
+             model_adapter_cls = benchmark.output_types[0]
+
+         model_adapter = get_model_adapter(model_adapter_cls)
+         return model_adapter(
              model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
evalscope/models/chat_adapter.py CHANGED
@@ -3,8 +3,10 @@ import time
  import torch
  from typing import List, Union

+ from evalscope.constants import OutputType
  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
  from evalscope.utils.logger import get_logger
  from evalscope.utils.model_utils import fix_do_sample_warning
@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
  logger = get_logger()


+ @register_model_adapter(OutputType.GENERATION)
  class ChatGenerationModelAdapter(BaseModelAdapter):
      """
      Chat generation model adapter.
@@ -102,7 +105,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
          # Get input ids
          inputs = self.tokenizer(
              formatted_prompts, return_tensors='pt', padding=True, truncation=True,
-             padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+             padding_side='left').to(self.model.device)  # padding_side='left' is important for chat model
          input_ids = inputs['input_ids']

          # Run inference
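The last hunk moves the tokenized batch onto the model's own device and keeps left padding, which decoder-only chat models need so that each prompt ends right where generation starts. A hedged, standalone sketch of the same call pattern; the checkpoint name is only an example, and passing padding_side directly to the tokenizer call requires a reasonably recent transformers release.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = 'Qwen/Qwen2.5-0.5B-Instruct'  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)  # add device_map='auto' for GPU placement
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # batched padding needs a pad token

prompts = ['Hello!', 'Name three evaluation benchmarks for language models.']
# Left padding keeps the prompts right-aligned so generate() continues each one seamlessly;
# .to(model.device) matches the tensor device to wherever the weights were loaded.
inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True,
                   padding_side='left').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))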
evalscope/models/choice_adapter.py CHANGED
@@ -3,11 +3,14 @@ import time
  import torch
  from typing import List

+ from evalscope.constants import OutputType
  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage


+ @register_model_adapter(OutputType.MULTIPLE_CHOICE)
  class MultiChoiceModelAdapter(BaseModelAdapter):
      """ The multi-choice model adapter. """

@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          return log_probs, {'tokens': tokens}


+ @register_model_adapter(OutputType.CONTINUOUS)
  class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
      """
      Continuation-logits model adapter.
evalscope/models/custom_adapter.py CHANGED
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union

  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.custom import CustomModel
+ from evalscope.models.register import register_model_adapter


+ @register_model_adapter('custom')
  class CustomModelAdapter(BaseModelAdapter):

      def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/local_model.py CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Optional

  from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.model_utils import get_device

  if TYPE_CHECKING:
      from evalscope.config import TaskConfig
@@ -28,7 +29,7 @@ class LocalModel:

          self.model_id = model_id
          self.model_revision = model_revision
-         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         self.device = device_map

          self.tokenizer = AutoTokenizer.from_pretrained(
              self.model_id,
@@ -64,7 +65,7 @@ def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
      if task_cfg.eval_type != EvalType.CHECKPOINT:
          return None
      else:
-         device_map = task_cfg.model_args.get('device_map', 'auto')
+         device_map = task_cfg.model_args.get('device_map', get_device())
          cache_dir = task_cfg.model_args.get('cache_dir', None)
          model_precision = task_cfg.model_args.get('precision', 'torch.float16')
          model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
evalscope/models/register.py ADDED
@@ -0,0 +1,28 @@
+ MODEL_ADAPTERS = {}
+
+
+ def register_model_adapter(name):
+     """
+     Decorator to register a model adapter with a given name.
+     :param name: The name of the model adapter.
+     """
+
+     def decorator(adapter):
+         if name in MODEL_ADAPTERS:
+             raise ValueError(f"Model adapter '{name}' is already registered.")
+         MODEL_ADAPTERS[name] = adapter
+         return adapter
+
+     return decorator
+
+
+ def get_model_adapter(name):
+     """
+     Retrieve a registered model adapter by name.
+     :param name: The name of the model adapter.
+     :return: The model adapter class or function.
+     """
+     if name not in MODEL_ADAPTERS:
+         raise ValueError(
+             f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+     return MODEL_ADAPTERS[name]
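This new registry is what lets the adapter modules above self-register ('base', 'server', 'custom', and the OutputType-keyed adapters) and lets initialize_model_adapter resolve a class by output type. A short usage sketch against the module as added here, assuming evalscope 0.12.1 is installed; the EchoAdapter is hypothetical and only illustrates the decorator.

from evalscope.models.register import get_model_adapter, register_model_adapter

@register_model_adapter('echo')  # hypothetical adapter, for illustration only
class EchoAdapter:

    def predict(self, inputs):
        return inputs

adapter_cls = get_model_adapter('echo')
print(adapter_cls().predict(['ping']))  # ['ping']

try:
    get_model_adapter('missing')
except ValueError as err:
    print(err)  # lists every registered adapter name, which helps when debugging typos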
evalscope/models/server_adapter.py CHANGED
@@ -1,14 +1,18 @@
- import requests
- import time
+ import openai
+ from collections import defaultdict
+ from inspect import signature
+ from openai.types.chat import ChatCompletion, ChatCompletionChunk
+ from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

  from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.utils.chat_service import ChatMessage
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.logger import get_logger

  logger = get_logger()


+ @register_model_adapter('server')
  class ServerModelAdapter(BaseModelAdapter):
      """
      Server model adapter to request remote API model and generate results.
@@ -21,19 +25,32 @@ class ServerModelAdapter(BaseModelAdapter):
              model_id: The ID of the remote API model.
              api_key: The API key of the remote API model.
          """
-         self.api_url = api_url
+         self.api_url = api_url.rstrip('/').rsplit('/chat/completions', 1)[0]
          self.model_id = model_id
          self.api_key = api_key
+
+         self.client = openai.OpenAI(
+             api_key=api_key,
+             base_url=self.api_url,
+         )
+         self.supported_params = self._get_supported_params()
+
          self.seed = kwargs.get('seed', None)
+         self.timeout = kwargs.get('timeout', 60)
+         self.stream = kwargs.get('stream', False)
          self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
          super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-     def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
+     def _get_supported_params(self):
+         sig = signature(self.client.chat.completions.create)
+         return list(sig.parameters.keys())
+
+     def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
          """
          Model prediction func.

          Args:
-             inputs (List[Union[str, dict, list]]): The input data.
+             inputs (List[dict]): The input data.
              infer_cfg (dict): Inference configuration.

          Returns:
@@ -63,20 +80,19 @@ class ServerModelAdapter(BaseModelAdapter):
          response = self.send_request(request_json)
          return response

-     def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> dict:
+     def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> list:
          """
-         Make request content for API.
+         Make request content for OpenAI API.
          """
+         messages = []
          if system_prompt:
-             messages = [
-                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
-                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
-             ]
-         else:
-             messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
-         return {'messages': messages}
+             messages.append({'role': 'system', 'content': system_prompt})
+
+         messages.append({'role': 'user', 'content': query})

-     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
+         return messages
+
+     def make_request(self, content: list, infer_cfg: dict = {}) -> dict:
          """Make request to remote API."""
          # Format request JSON according to OpenAI API format
          from evalscope.config import DEFAULT_GENERATION_CONFIG
@@ -86,20 +102,82 @@ class ServerModelAdapter(BaseModelAdapter):
              'temperature': 0.0,
          }

-         request_json = {'model': self.model_id, **content, **infer_cfg}
+         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
+
+         if self.timeout:
+             request_json['timeout'] = self.timeout
+
+         if self.stream:
+             request_json['stream'] = self.stream
+             request_json['stream_options'] = {'include_usage': True}
+
          logger.debug(f'Request to remote API: {request_json}')
+
          return request_json

-     def send_request(self, request_json: dict, max_retries: int = 3) -> dict:
-         for attempt in range(max_retries):
-             response = requests.post(
-                 self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
-             if response.status_code == 200:
-                 response_data = response.json()
-                 return response_data
-             logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
-             if attempt < max_retries - 1:
-                 time.sleep(5)  # Sleep for 5 seconds before retrying
+     def _parse_extra_params(self, request_json):
+         api_params = {}
+         extra_body = {}
+         for key, value in request_json.items():
+             if key in self.supported_params:
+                 api_params[key] = value
              else:
-                 raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
-                                    f'{response.status_code} {response.text}')
+                 extra_body[key] = value
+
+         if extra_body:
+             api_params['extra_body'] = extra_body
+         return api_params
+
+     def send_request(self, request_json: dict) -> dict:
+         try:
+             parsed_request = self._parse_extra_params(request_json)
+             response = self.client.chat.completions.create(**parsed_request)
+
+             if response and self.stream:
+                 response = self._collect_stream_response(response)
+
+             return response.model_dump(exclude_unset=True)
+         except Exception as e:
+             logger.error(f'Error when calling remote API: {str(e)}')
+             raise
+
+     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
+         collected_chunks = []
+         collected_messages = defaultdict(list)
+         collected_reasoning = defaultdict(list)
+
+         for chunk in response_stream:
+             collected_chunks.append(chunk)
+             for choice in chunk.choices:
+                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+                 if choice.delta.content is not None:
+                     collected_messages[choice.index].append(choice.delta.content)
+
+         choices = []
+         for index, messages in collected_messages.items():
+             full_reply_content = ''.join(messages)
+             reasoning = ''.join(collected_reasoning[index])
+             # use the finish_reason from the last chunk that generated this choice
+             finish_reason = None
+             for chunk in reversed(collected_chunks):
+                 if chunk.choices and chunk.choices[0].index == index:
+                     finish_reason = chunk.choices[0].finish_reason
+                     break
+
+             choice = Choice(
+                 finish_reason=finish_reason or 'stop',
+                 index=index,
+                 message=ChatCompletionMessage(
+                     role='assistant', content=full_reply_content, reasoning_content=reasoning))
+             choices.append(choice)
+
+         # build the final completion object
+         return ChatCompletion(
+             id=collected_chunks[0].id,
+             choices=choices,
+             created=collected_chunks[0].created,
+             model=collected_chunks[0].model,
+             object='chat.completion',
+             usage=collected_chunks[-1].usage  # use the usage from the last chunk
+         )
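The rewritten send_request leans on the OpenAI Python SDK: any request key that chat.completions.create's signature does not accept is moved into extra_body, so backend-specific sampling options still reach the server. A standalone sketch of that split; the base_url, model name, and extra keys are illustrative and no request is actually sent.

from inspect import signature

import openai

client = openai.OpenAI(api_key='EMPTY', base_url='http://127.0.0.1:8801/v1')  # illustrative endpoint
supported = set(signature(client.chat.completions.create).parameters)

request = {
    'model': 'qwen2.5',
    'messages': [{'role': 'user', 'content': 'hello'}],
    'top_k': 20,                 # not a chat.completions.create parameter
    'repetition_penalty': 1.05,  # backend-specific sampling knob
}

api_params = {k: v for k, v in request.items() if k in supported}
extra_body = {k: v for k, v in request.items() if k not in supported}
if extra_body:
    api_params['extra_body'] = extra_body

# client.chat.completions.create(**api_params) would forward top_k and
# repetition_penalty inside the JSON body of the request.
print(sorted(extra_body))  # ['repetition_penalty', 'top_k']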
evalscope/perf/__init__.py CHANGED
@@ -1 +0,0 @@
- from evalscope.perf.main import run_perf_benchmark
evalscope/perf/arguments.py CHANGED
@@ -21,9 +21,9 @@ class Arguments:
      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
      headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-     connect_timeout: int = 120  # Connection timeout in seconds
-     read_timeout: int = 120  # Read timeout in seconds
-     api_key: str = 'EMPTY'
+     connect_timeout: int = 600  # Connection timeout in seconds
+     read_timeout: int = 600  # Read timeout in seconds
+     api_key: Optional[str] = None

      # Performance and parallelism
      number: Optional[int] = None  # Number of requests to be made
@@ -61,6 +61,7 @@ class Arguments:
      stream: Optional[bool] = None  # Whether to stream the response
      temperature: Optional[float] = None  # Temperature setting for the response
      top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
+     top_k: Optional[int] = None  # Top-k sampling setting for the response

      @staticmethod
      def from_args(args):
@@ -99,7 +100,9 @@ class Arguments:
              stop_token_ids=args.stop_token_ids,
              stream=args.stream,
              temperature=args.temperature,
-             top_p=args.top_p)
+             top_p=args.top_p,
+             top_k=args.top_k,
+         )

      def __post_init__(self):
          self.headers = self.headers or {}  # Default to empty dictionary
@@ -122,7 +125,13 @@ class ParseKVAction(argparse.Action):
              setattr(namespace, self.dest, {})
          else:
              try:
-                 kv_dict = dict(kv.split('=') for kv in values)
+                 kv_dict = {}
+                 for kv in values:
+                     parts = kv.split('=', 1)  # only split the first '='
+                     if len(parts) != 2:
+                         raise ValueError(f'Invalid key-value pair: {kv}')
+                     key, value = parts
+                     kv_dict[key.strip()] = value.strip()
                  setattr(namespace, self.dest, kv_dict)
              except ValueError as e:
                  parser.error(f'Error parsing key-value pairs: {e}')
@@ -141,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
      parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
      parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
-     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
-     parser.add_argument('--read-timeout', type=int, default=120, help='The network read timeout')
+     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')

      # Performance and parallelism
      parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
@@ -183,6 +192,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
      parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
      parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)

      # yapf: enable
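The ParseKVAction rewrite above splits each key=value pair only on the first '=', so header values that themselves contain '=' (bearer tokens, base64 padding, and so on) survive intact. A standalone, slightly simplified sketch of the same behaviour:

import argparse


class ParseKVAction(argparse.Action):

    def __call__(self, parser, namespace, values, option_string=None):
        kv_dict = {}
        for kv in values:
            parts = kv.split('=', 1)  # only split the first '='
            if len(parts) != 2:
                parser.error(f'Invalid key-value pair: {kv}')
            key, value = parts
            kv_dict[key.strip()] = value.strip()
        setattr(namespace, self.dest, kv_dict)


parser = argparse.ArgumentParser()
parser.add_argument('--headers', nargs='+', action=ParseKVAction)
args = parser.parse_args(['--headers', 'Authorization=Bearer abc==', 'X-Trace=1'])
print(args.headers)  # {'Authorization': 'Bearer abc==', 'X-Trace': '1'}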
 
evalscope/perf/http_client.py CHANGED
@@ -23,10 +23,7 @@ class AioHttpClient:
          self.read_timeout = args.read_timeout
          self.connect_timeout = args.connect_timeout
          self.client = aiohttp.ClientSession(
-             timeout=aiohttp.ClientTimeout(
-                 total=self.read_timeout + self.connect_timeout,
-                 connect=self.connect_timeout,
-                 sock_read=self.read_timeout),
+             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
              connector=aiohttp.TCPConnector(limit=1),
              trace_configs=[self._create_trace_config()] if args.debug else [])

@@ -102,6 +99,11 @@ class AioHttpClient:
              async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                  async for rsp in self._handle_response(response):
                      yield rsp
+         except asyncio.TimeoutError:
+             logger.error(
+                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+             )
+             yield (True, None, 'Timeout')
          except (aiohttp.ClientConnectorError, Exception) as e:
              logger.error(e)
              yield (True, None, e)
@@ -143,9 +145,9 @@ async def test_connection(args: Arguments) -> bool:
      client = AioHttpClient(args)
      async with client:
          if 'chat/completions' in args.url:
-             request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model}
+             request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model, 'max_tokens': 10}
          else:
-             request = {'prompt': 'hello', 'model': args.model}
+             request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
          async for is_error, state_code, response_data in client.post(request):
              return is_error, state_code, response_data
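For context on the first hunk: aiohttp's ClientTimeout treats total as a hard cap on the whole request, which cuts off long streaming generations; dropping it and keeping only connect and sock_read bounds connection setup and per-read stalls instead. A minimal sketch of the resulting configuration (the URL and values are illustrative):

import asyncio

import aiohttp


async def probe(url: str = 'http://127.0.0.1:8877/v1/models') -> int:
    # No overall `total` cap: a long, steadily streaming response may keep going, while
    # connection setup and any single read stall are still limited to 600 s each.
    timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            return resp.status


# asyncio.run(probe())  # requires a server listening on the illustrative URL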
 
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -92,6 +92,8 @@ class OpenaiPlugin(ApiPluginBase):
              payload['temperature'] = param.temperature
          if param.top_p is not None:
              payload['top_p'] = param.top_p
+         if param.top_k is not None:
+             payload['top_k'] = param.top_k
          return payload

      def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
@@ -155,5 +157,13 @@ class OpenaiPlugin(ApiPluginBase):
                  input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                  output_tokens += len(self.tokenizer.encode(full_response_content))
          else:
-             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
+             raise ValueError('Error: Unable to retrieve usage information\n\n'
+                              'This error occurs when:\n'
+                              '1. The API response does not contain usage data, AND\n'
+                              '2. No tokenizer has been specified or found.\n\n'
+                              'To resolve this issue, do ONE of the following:\n'
+                              "a) Ensure that the API you're using supports and returns usage information, OR\n"
+                              'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+                              'If you continue to experience issues, '
+                              'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .')
          return input_tokens, output_tokens
evalscope/perf/utils/analysis_result.py CHANGED
@@ -3,7 +3,7 @@ import json
  import pickle
  import sqlite3

- result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
+ result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
  con = sqlite3.connect(result_db_path)
  query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
      FROM result WHERE success='1'"
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -23,6 +23,7 @@ class BenchmarkData:
      n_chunks: int = 0
      n_chunks_time: float = 0.0
      max_gpu_memory_cost = 0
+     time_per_output_token: float = 0.0

      prompt_tokens = None
      completion_tokens = None
@@ -37,6 +38,7 @@ class BenchmarkData:
          self.first_chunk_latency = self.query_latency
          self.n_chunks = 1
          self.n_chunks_time = self.query_latency
+         self.time_per_output_token = self.query_latency / self.completion_tokens

      def _calculate_tokens(self, api_plugin):
          self.prompt_tokens, self.completion_tokens = \
@@ -63,6 +65,7 @@ class BenchmarkMetrics:
      start_time: Optional[float] = None
      total_time: float = 1.0
      n_total_queries: int = 0
+     n_time_per_output_token: float = 0.0

      avg_first_chunk_latency: float = -1
      avg_latency: float = -1
@@ -92,6 +95,7 @@ class BenchmarkMetrics:
              self.total_first_chunk_latency += benchmark_data.first_chunk_latency
              self.n_total_chunks += benchmark_data.n_chunks
              self.total_chunks_time += benchmark_data.n_chunks_time
+             self.n_time_per_output_token += benchmark_data.time_per_output_token
          else:
              self.n_failed_queries += 1

@@ -108,7 +112,7 @@ class BenchmarkMetrics:
              self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
              self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
              self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
-             self.avg_time_per_token = self.total_time / self.n_total_completion_tokens
+             self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
              self.qps = self.n_succeed_queries / self.total_time
          except ZeroDivisionError as e:
              logger.exception(e)
@@ -125,7 +129,7 @@ class BenchmarkMetrics:
          'Average QPS': round(self.qps, default_ndigits),
          'Average latency (s)': round(self.avg_latency, default_ndigits),
          'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
-         'Average time per output token (s)': round(self.avg_time_per_token, 5),
+         'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
          'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
          'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
          'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
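These hunks change how the reported "Average time per output token" is computed: instead of dividing the benchmark's total wall-clock time by the total number of completion tokens, each successful request now contributes its own latency-per-token and the report averages those. A small worked sketch of the difference; the numbers are made up, and the sum of latencies stands in for the total time of the old definition.

latencies = [2.0, 8.0]          # per-request end-to-end latency, seconds
completion_tokens = [200, 100]  # tokens generated per request

# Old definition: benchmark-wide time divided by benchmark-wide token count.
old_avg = sum(latencies) / sum(completion_tokens)        # ≈ 0.033 s/token

# New definition: mean of each request's own time-per-output-token,
# so fast and slow requests weigh equally regardless of how many tokens they produced.
per_request = [l / t for l, t in zip(latencies, completion_tokens)]
new_avg = sum(per_request) / len(per_request)            # 0.045 s/token
print(round(old_avg, 3), round(new_avg, 3))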
evalscope/report/app.py CHANGED
@@ -19,6 +19,9 @@ from evalscope.version import __version__
  logger = get_logger()

  PLOTLY_THEME = 'plotly_dark'
+ REPORT_TOKEN = '@@'
+ MODEL_TOKEN = '::'
+ DATASET_TOKEN = ', '


  def scan_for_report_folders(root_path):
@@ -42,8 +45,9 @@ def scan_for_report_folders(root_path):
              datasets = []
              for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
                  datasets.append(os.path.basename(dataset_item).split('.')[0])
-             datasets = ','.join(datasets)
-             reports.append(f'{os.path.basename(folder)}@{os.path.basename(model_item)}:{datasets}')
+             datasets = DATASET_TOKEN.join(datasets)
+             reports.append(
+                 f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

      reports = sorted(reports, reverse=True)
      logger.debug(f'reports: {reports}')
@@ -51,9 +55,9 @@ def scan_for_report_folders(root_path):


  def process_report_name(report_name: str):
-     prefix, report_name = report_name.split('@')
-     model_name, datasets = report_name.split(':')
-     datasets = datasets.split(',')
+     prefix, report_name = report_name.split(REPORT_TOKEN)
+     model_name, datasets = report_name.split(MODEL_TOKEN)
+     datasets = datasets.split(DATASET_TOKEN)
      return prefix, model_name, datasets


@@ -121,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):


  def plot_single_report_scores(df: pd.DataFrame):
+     if df is None:
+         return None
+     logger.debug(f'df: {df}')
      plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

      width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -171,7 +178,7 @@ def plot_single_dataset_scores(df: pd.DataFrame):
          text=df[ReportKey.score],
          barmode='group')

-     width = 0.2 if len(df[ReportKey.subset_name]) <= 5 else None
+     width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
      plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
      plot.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
      return plot
@@ -519,8 +526,8 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
          outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
      def update_single_report_data(root_path, report_name):
          report_list, datasets, task_cfg = load_single_report(root_path, report_name)
-         work_dir = os.path.join(root_path, report_name.split('@')[0])
-         model_name = report_name.split('@')[1].split(':')[0]
+         work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
+         model_name = report_name.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
          return (report_list, task_cfg, gr.update(choices=datasets, value=datasets[0]), work_dir, model_name)

      @report_list.change(inputs=[report_list], outputs=[score_plot, score_table, sunburst_plot])
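The single-character separators used previously ('@', ':', ',') can also appear in folder, model, or dataset names; the new multi-character tokens make the composed report identifier unambiguous to split. A standalone sketch of composing and parsing a report name with the new convention; the folder, model, and dataset names are invented for illustration.

REPORT_TOKEN = '@@'
MODEL_TOKEN = '::'
DATASET_TOKEN = ', '

report_name = (f'20250301_120000{REPORT_TOKEN}qwen2.5-7b-instruct{MODEL_TOKEN}'
               + DATASET_TOKEN.join(['gsm8k', 'mmlu', 'ifeval']))

prefix, rest = report_name.split(REPORT_TOKEN)
model_name, datasets = rest.split(MODEL_TOKEN)
datasets = datasets.split(DATASET_TOKEN)
print(prefix, model_name, datasets)
# 20250301_120000 qwen2.5-7b-instruct ['gsm8k', 'mmlu', 'ifeval']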