evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +15 -18
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +12 -11
  7. evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  8. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  9. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  10. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  11. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  12. evalscope/benchmarks/data_adapter.py +59 -21
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  22. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  38. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  39. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  40. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  41. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  42. evalscope/benchmarks/super_gpqa/utils.py +85 -0
  43. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  45. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  46. evalscope/benchmarks/utils.py +43 -0
  47. evalscope/collections/evaluator.py +14 -5
  48. evalscope/config.py +15 -2
  49. evalscope/constants.py +14 -0
  50. evalscope/evaluator/evaluator.py +51 -13
  51. evalscope/metrics/llm_judge.py +104 -0
  52. evalscope/metrics/named_metrics.py +1 -0
  53. evalscope/models/__init__.py +2 -1
  54. evalscope/models/base_adapter.py +25 -5
  55. evalscope/models/chat_adapter.py +3 -0
  56. evalscope/models/choice_adapter.py +4 -0
  57. evalscope/models/custom_adapter.py +2 -0
  58. evalscope/models/register.py +28 -0
  59. evalscope/models/server_adapter.py +35 -8
  60. evalscope/perf/arguments.py +13 -7
  61. evalscope/perf/benchmark.py +5 -0
  62. evalscope/perf/http_client.py +15 -5
  63. evalscope/perf/main.py +1 -0
  64. evalscope/perf/utils/analysis_result.py +1 -1
  65. evalscope/report/app.py +3 -0
  66. evalscope/report/combinator.py +2 -2
  67. evalscope/run.py +6 -5
  68. evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope/third_party/thinkbench/eval.py +220 -55
  70. evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  72. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  73. evalscope/utils/chat_service.py +1 -0
  74. evalscope/utils/filters.py +59 -0
  75. evalscope/utils/logger.py +3 -3
  76. evalscope/version.py +2 -2
  77. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
  78. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
  79. tests/cli/test_all.py +144 -0
  80. tests/cli/test_collection.py +28 -2
  81. tests/cli/test_run.py +201 -32
  82. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  83. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  84. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  85. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/llm_judge.py CHANGED
@@ -0,0 +1,104 @@
+ import os
+ import re
+ from typing import Any, Dict, List, Optional
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
+
+ Question: {question}
+
+ Reference Answer: {gold}
+
+ Model Answer: {pred}
+
+ Evaluate the model's answer based on correctness compared to the reference answer.
+ Grade the predicted answer of this new question as one of:
+ A: CORRECT
+ B: INCORRECT
+
+ Just return the letters "A" or "B", with no text around it.
+ """ # noqa: E501
+
+
+ class LLMJudge:
+     """
+     A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
+     """
+
+     def __init__(self,
+                  api_key: Optional[str] = None,
+                  api_url: Optional[str] = None,
+                  model_id: Optional[str] = None,
+                  system_prompt: Optional[str] = None,
+                  prompt_template: Optional[str] = None,
+                  generation_config: Optional[Dict[str, Any]] = None,
+                  **kwargs):
+         """
+         Initialize LLMJudge metric.
+
+         Args:
+             api_key (str, optional): API key for OpenAI or compatible service
+             api_base (str, optional): API base URL
+             model_id (str, optional): Model ID for LLM
+             system_prompt (str, optional): System prompt for the judge
+             prompt_template (str, optional): Prompt template for the judge
+             generation_config (dict, optional): Generation configuration for the judge
+         """
+         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
+         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
+         self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-3.5-turbo')
+         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
+         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+         self.generation_config = generation_config
+
+         from evalscope.models.server_adapter import ServerModelAdapter
+
+         # Initialize ServerModelAdapter
+         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
+
+     def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> float:
+         """
+         Args:
+             prompt (str): The prompt to evaluate
+             system_prompt (str, optional): The system prompt to use for the evaluation
+         Returns:
+             float: The score of the evaluation
+         """
+         input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
+
+         # Inference configuration
+         infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
+         if self.generation_config:
+             infer_cfg.update(self.generation_config)
+
+         try:
+             # Send request using ServerModelAdapter
+             response = self.server_adapter.process_single_input(input_data, infer_cfg)
+
+             # Extract content from response
+             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
+             return llm_response
+         except Exception as e:
+             logger.error(f'Error during LLM evaluation: {e}')
+             return None
+
+     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
+         if question is None:
+             question = 'Not provided'
+         return self.prompt_template.format(question=question, pred=pred, gold=gold)
+
+     def get_score(self, response: str) -> float:
+         if response is None:
+             return 0
+         match = re.search(r'(A|B)', response)
+         if match:
+             answer = match.group(0)
+             if answer == 'A':
+                 return 1
+             elif answer == 'B':
+                 return 0
+         else:
+             return 0
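
For orientation, a minimal usage sketch of the new judge metric (not part of the diff; the judge model name is a placeholder, and an OpenAI-compatible endpoint is assumed to be reachable via OPENAI_API_BASE / OPENAI_API_KEY):

# Hypothetical usage of the new LLMJudge (illustrative only).
from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge(model_id='gpt-4o-mini')  # placeholder judge model
prompt = judge.build_prompt(
    pred='The capital of France is Paris.',
    gold='Paris',
    question='What is the capital of France?')
raw_grade = judge(prompt)            # raw "A"/"B" text returned by the judge model
score = judge.get_score(raw_grade)   # 1 if graded CORRECT ("A"), else 0
print(score)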
evalscope/metrics/named_metrics.py CHANGED
@@ -35,6 +35,7 @@ metric_registry = MetricRegistry()
  metric_registry.register(Metric(name='AverageAccuracy', object=mean))
  metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
  metric_registry.register(Metric(name='AverageBLEU', object=mean))
+ metric_registry.register(Metric(name='AverageRouge', object=mean))
  metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
  metric_registry.register(Metric(name='AveragePass@1', object=mean))
  for k in range(1, 17):
evalscope/models/__init__.py CHANGED
@@ -7,10 +7,11 @@ from evalscope.models.custom import CustomModel
  from evalscope.models.custom_adapter import CustomModelAdapter
  from evalscope.models.local_model import LocalModel, get_local_model
  from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
+ from evalscope.models.register import get_model_adapter
  from evalscope.models.server_adapter import ServerModelAdapter

  __all__ = [
      'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
      'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
-     'LocalModel', 'get_local_model', 'initialize_model_adapter'
+     'LocalModel', 'get_local_model', 'initialize_model_adapter', 'get_model_adapter'
  ]
evalscope/models/base_adapter.py CHANGED
@@ -1,15 +1,21 @@
  import torch
  from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from typing import TYPE_CHECKING, Any, List, Optional, Union

- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.models.custom import CustomModel
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import get_model_adapter, register_model_adapter
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  if TYPE_CHECKING:
+     from evalscope.benchmarks import BenchmarkMeta
      from evalscope.config import TaskConfig


+ @register_model_adapter('base')
  class BaseModelAdapter(ABC):

      def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
@@ -33,7 +39,7 @@ class BaseModelAdapter(ABC):
          raise NotImplementedError


- def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'):
+ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta', base_model: 'LocalModel'):
      """Initialize the model adapter based on the task configuration."""
      if task_cfg.dry_run:
          from evalscope.models.model import DummyChatModel
@@ -43,8 +49,14 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
              raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
          from evalscope.models import CustomModelAdapter
          return CustomModelAdapter(custom_model=task_cfg.model)
-     elif task_cfg.eval_type == EvalType.SERVICE:
+     elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
          from evalscope.models import ServerModelAdapter
+
+         if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
+             logger.warning('Output type is set to logits. This is not supported for service evaluation. '
+                            'Setting output type to generation by default.')
+             benchmark.model_adapter = OutputType.GENERATION
+
          return ServerModelAdapter(
              api_url=task_cfg.api_url,
              model_id=task_cfg.model,
@@ -54,5 +66,13 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
              stream=task_cfg.stream,
          )
      else:
-         return model_adapter_cls(
+         # for local model, we need to determine the model adapter class based on the output type
+         model_adapter_cls = benchmark.model_adapter
+         if model_adapter_cls not in benchmark.output_types:
+             logger.warning(f'Output type {model_adapter_cls} is not supported for benchmark {benchmark.name}. '
+                            f'Using {benchmark.output_types[0]} instead.')
+             model_adapter_cls = benchmark.output_types[0]
+
+         model_adapter = get_model_adapter(model_adapter_cls)
+         return model_adapter(
              model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
evalscope/models/chat_adapter.py CHANGED
@@ -3,8 +3,10 @@ import time
  import torch
  from typing import List, Union

+ from evalscope.constants import OutputType
  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
  from evalscope.utils.logger import get_logger
  from evalscope.utils.model_utils import fix_do_sample_warning
@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
  logger = get_logger()


+ @register_model_adapter(OutputType.GENERATION)
  class ChatGenerationModelAdapter(BaseModelAdapter):
      """
      Chat generation model adapter.
evalscope/models/choice_adapter.py CHANGED
@@ -3,11 +3,14 @@ import time
  import torch
  from typing import List

+ from evalscope.constants import OutputType
  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.local_model import LocalModel
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage


+ @register_model_adapter(OutputType.MULTIPLE_CHOICE)
  class MultiChoiceModelAdapter(BaseModelAdapter):
      """ The multi-choice model adapter. """

@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          return log_probs, {'tokens': tokens}


+ @register_model_adapter(OutputType.CONTINUOUS)
  class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
      """
      Continuation-logits model adapter.
evalscope/models/custom_adapter.py CHANGED
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union

  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.custom import CustomModel
+ from evalscope.models.register import register_model_adapter


+ @register_model_adapter('custom')
  class CustomModelAdapter(BaseModelAdapter):

      def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/register.py CHANGED
@@ -0,0 +1,28 @@
+ MODEL_ADAPTERS = {}
+
+
+ def register_model_adapter(name):
+     """
+     Decorator to register a model adapter with a given name.
+     :param name: The name of the model adapter.
+     """
+
+     def decorator(adapter):
+         if name in MODEL_ADAPTERS:
+             raise ValueError(f"Model adapter '{name}' is already registered.")
+         MODEL_ADAPTERS[name] = adapter
+         return adapter
+
+     return decorator
+
+
+ def get_model_adapter(name):
+     """
+     Retrieve a registered model adapter by name.
+     :param name: The name of the model adapter.
+     :return: The model adapter class or function.
+     """
+     if name not in MODEL_ADAPTERS:
+         raise ValueError(
+             f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+     return MODEL_ADAPTERS[name]
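
This registry is what the @register_model_adapter decorators added in base_adapter.py, chat_adapter.py, choice_adapter.py and custom_adapter.py above (and in server_adapter.py below) hook into. A minimal sketch of the intended flow, using an invented adapter name:

# Illustrative sketch of the new registry (the adapter name 'toy' is made up).
from evalscope.models.register import MODEL_ADAPTERS, get_model_adapter, register_model_adapter

@register_model_adapter('toy')
class ToyAdapter:
    """Stand-in adapter class; real adapters subclass BaseModelAdapter."""

assert get_model_adapter('toy') is ToyAdapter
print(list(MODEL_ADAPTERS))  # 'toy' plus whatever adapters were registered on import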
evalscope/models/server_adapter.py CHANGED
@@ -1,15 +1,18 @@
  import openai
  from collections import defaultdict
+ from inspect import signature
  from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

  from evalscope.models.base_adapter import BaseModelAdapter
+ from evalscope.models.register import register_model_adapter
  from evalscope.utils.logger import get_logger

  logger = get_logger()


+ @register_model_adapter('server')
  class ServerModelAdapter(BaseModelAdapter):
      """
      Server model adapter to request remote API model and generate results.
@@ -30,6 +33,7 @@ class ServerModelAdapter(BaseModelAdapter):
              api_key=api_key,
              base_url=self.api_url,
          )
+         self.supported_params = self._get_supported_params()

          self.seed = kwargs.get('seed', None)
          self.timeout = kwargs.get('timeout', 60)
@@ -37,12 +41,16 @@ class ServerModelAdapter(BaseModelAdapter):
          self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
          super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-     def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
+     def _get_supported_params(self):
+         sig = signature(self.client.chat.completions.create)
+         return list(sig.parameters.keys())
+
+     def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
          """
          Model prediction func.

          Args:
-             inputs (List[Union[str, dict, list]]): The input data.
+             inputs (List[dict]): The input data.
              infer_cfg (dict): Inference configuration.

          Returns:
@@ -104,34 +112,52 @@ class ServerModelAdapter(BaseModelAdapter):
              request_json['stream_options'] = {'include_usage': True}

          logger.debug(f'Request to remote API: {request_json}')
+
          return request_json

+     def _parse_extra_params(self, request_json):
+         api_params = {}
+         extra_body = {}
+         for key, value in request_json.items():
+             if key in self.supported_params:
+                 api_params[key] = value
+             else:
+                 extra_body[key] = value
+
+         if extra_body:
+             api_params['extra_body'] = extra_body
+         return api_params
+
      def send_request(self, request_json: dict) -> dict:
          try:
-             response = self.client.chat.completions.create(**request_json)
+             parsed_request = self._parse_extra_params(request_json)
+             response = self.client.chat.completions.create(**parsed_request)

-             if self.stream:
+             if response and self.stream:
                  response = self._collect_stream_response(response)

              return response.model_dump(exclude_unset=True)
          except Exception as e:
-             logger.error(f'Error when calling OpenAI API: {str(e)}')
+             logger.error(f'Error when calling remote API: {str(e)}')
              raise

      def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
          collected_chunks = []
          collected_messages = defaultdict(list)
+         collected_reasoning = defaultdict(list)

          for chunk in response_stream:
              collected_chunks.append(chunk)
              for choice in chunk.choices:
+                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
                  if choice.delta.content is not None:
                      collected_messages[choice.index].append(choice.delta.content)

          choices = []
          for index, messages in collected_messages.items():
              full_reply_content = ''.join(messages)
-
+             reasoning = ''.join(collected_reasoning[index])
              # use the finish_reason from the last chunk that generated this choice
              finish_reason = None
              for chunk in reversed(collected_chunks):
@@ -140,9 +166,10 @@ class ServerModelAdapter(BaseModelAdapter):
                      break

              choice = Choice(
-                 finish_reason=finish_reason,
+                 finish_reason=finish_reason or 'stop',
                  index=index,
-                 message=ChatCompletionMessage(role='assistant', content=full_reply_content))
+                 message=ChatCompletionMessage(
+                     role='assistant', content=full_reply_content, reasoning_content=reasoning))
              choices.append(choice)

          # build the final completion object
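
The new _parse_extra_params split is worth spelling out: any request field the installed OpenAI SDK does not accept as a keyword of chat.completions.create is moved into extra_body rather than dropped. A standalone sketch of the same idea, with a placeholder endpoint and model name:

# Standalone sketch of the parameter-splitting behaviour (placeholder endpoint/model).
from inspect import signature
import openai

client = openai.OpenAI(api_key='EMPTY', base_url='http://127.0.0.1:8000/v1')
supported = set(signature(client.chat.completions.create).parameters)

request = {
    'model': 'my-model',
    'messages': [{'role': 'user', 'content': 'hi'}],
    'temperature': 0.0,
    'top_k': 20,  # not a named argument of the OpenAI SDK
}
api_params = {k: v for k, v in request.items() if k in supported}
extra_body = {k: v for k, v in request.items() if k not in supported}
if extra_body:
    api_params['extra_body'] = extra_body
# client.chat.completions.create(**api_params) would now send top_k inside the JSON body.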
evalscope/perf/arguments.py CHANGED
@@ -21,9 +21,9 @@ class Arguments:
      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
      headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-     connect_timeout: int = 120  # Connection timeout in seconds
-     read_timeout: int = 120  # Read timeout in seconds
-     api_key: str = 'EMPTY'
+     connect_timeout: int = 600  # Connection timeout in seconds
+     read_timeout: int = 600  # Read timeout in seconds
+     api_key: Optional[str] = None

      # Performance and parallelism
      number: Optional[int] = None  # Number of requests to be made
@@ -125,7 +125,13 @@ class ParseKVAction(argparse.Action):
              setattr(namespace, self.dest, {})
          else:
              try:
-                 kv_dict = dict(kv.split('=') for kv in values)
+                 kv_dict = {}
+                 for kv in values:
+                     parts = kv.split('=', 1)  # only split the first '='
+                     if len(parts) != 2:
+                         raise ValueError(f'Invalid key-value pair: {kv}')
+                     key, value = parts
+                     kv_dict[key.strip()] = value.strip()
                  setattr(namespace, self.dest, kv_dict)
              except ValueError as e:
                  parser.error(f'Error parsing key-value pairs: {e}')
@@ -144,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
      parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
      parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
-     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
-     parser.add_argument('--read-timeout', type=int, default=120, help='The network read timeout')
+     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')

      # Performance and parallelism
      parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
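
The ParseKVAction rewrite matters for header values that themselves contain '=' (bearer tokens, base64 payloads): only the first '=' now separates key from value. A quick check, assuming ParseKVAction remains importable from evalscope.perf.arguments:

# Minimal check of the new key=value parsing (illustrative).
import argparse
from evalscope.perf.arguments import ParseKVAction

parser = argparse.ArgumentParser()
parser.add_argument('--headers', nargs='+', action=ParseKVAction)
args = parser.parse_args(['--headers', 'Authorization=Bearer abc==', 'X-Trace-Id=demo'])
print(args.headers)  # {'Authorization': 'Bearer abc==', 'X-Trace-Id': 'demo'}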
evalscope/perf/benchmark.py CHANGED
@@ -209,9 +209,14 @@ async def benchmark(args: Arguments) -> None:
      loop = asyncio.get_running_loop()
      add_signal_handlers(loop)

+     # init queue
      request_queue = asyncio.Queue()
      benchmark_data_queue = asyncio.Queue()

+     # reset event
+     query_send_completed_event.clear()
+     data_process_completed_event.clear()
+
      async def create_send_request_tasks():
          tasks: List[asyncio.Task] = []
          for idx in range(args.parallel):
evalscope/perf/http_client.py CHANGED
@@ -23,10 +23,7 @@ class AioHttpClient:
          self.read_timeout = args.read_timeout
          self.connect_timeout = args.connect_timeout
          self.client = aiohttp.ClientSession(
-             timeout=aiohttp.ClientTimeout(
-                 total=self.read_timeout + self.connect_timeout,
-                 connect=self.connect_timeout,
-                 sock_read=self.read_timeout),
+             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
              connector=aiohttp.TCPConnector(limit=1),
              trace_configs=[self._create_trace_config()] if args.debug else [])

@@ -102,6 +99,11 @@ class AioHttpClient:
              async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                  async for rsp in self._handle_response(response):
                      yield rsp
+         except asyncio.TimeoutError:
+             logger.error(
+                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+             )
+             yield (True, None, 'Timeout')
          except (aiohttp.ClientConnectorError, Exception) as e:
              logger.error(e)
              yield (True, None, e)
@@ -143,7 +145,15 @@ async def test_connection(args: Arguments) -> bool:
      client = AioHttpClient(args)
      async with client:
          if 'chat/completions' in args.url:
-             request = {'messages': [{'role': 'user', 'content': 'hello'}], 'model': args.model, 'max_tokens': 10}
+             request = {
+                 'messages': [{
+                     'role': 'user',
+                     'content': 'hello'
+                 }],
+                 'model': args.model,
+                 'max_tokens': 10,
+                 'stream': args.stream
+             }
          else:
              request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
          async for is_error, state_code, response_data in client.post(request):
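
Note on the timeout change above: the session no longer sets an overall total budget (previously read_timeout + connect_timeout), only a connect timeout and a per-read (sock_read) timeout, so long streaming responses are not cut off by total elapsed time. A minimal aiohttp sketch of the new configuration, with a placeholder URL:

# Sketch of the new timeout configuration (placeholder URL; values are the new 600s defaults).
import asyncio
import aiohttp

async def main():
    timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)  # no 'total' limit
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get('http://127.0.0.1:8877/v1/models') as resp:
            print(resp.status)

asyncio.run(main())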
evalscope/perf/main.py CHANGED
@@ -35,6 +35,7 @@ def run_perf_benchmark(args):
      loop = asyncio.get_event_loop()
      if platform.system() != 'Windows':
          add_signal_handlers(loop)
+
      loop.run_until_complete(benchmark(args))


evalscope/perf/utils/analysis_result.py CHANGED
@@ -3,7 +3,7 @@ import json
  import pickle
  import sqlite3

- result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
+ result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
  con = sqlite3.connect(result_db_path)
  query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
      FROM result WHERE success='1'"
evalscope/report/app.py CHANGED
@@ -125,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):


  def plot_single_report_scores(df: pd.DataFrame):
+     if df is None:
+         return None
+     logger.debug(f'df: {df}')
      plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

      width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
evalscope/report/combinator.py CHANGED
@@ -57,8 +57,8 @@ class ReportsRecorder:


  if __name__ == '__main__':
-     report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
-     # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+     report_dir_1 = './outputs/20250117_151926'
+     # report_dir_2 = './outputs/20250107_204445/reports'

      report_table = gen_table([report_dir_1])
      print(report_table)
evalscope/run.py CHANGED
@@ -2,7 +2,7 @@
  """
  Run evaluation for LLMs.
  """
- import os.path
+ import os
  from argparse import Namespace
  from datetime import datetime
  from typing import TYPE_CHECKING, List, Optional, Union
@@ -127,16 +127,17 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
      from evalscope.models import initialize_model_adapter

      benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-     # Initialize data adapter
-     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))

      if dataset_name == DataCollection.NAME:
          # EvaluatorCollection is a collection of evaluators
          from evalscope.collections import EvaluatorCollection
-         return EvaluatorCollection(task_cfg, data_adapter, outputs)
+         data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+         return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)

      # Initialize model adapter
-     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+     model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
+     # Initialize data adapter
+     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))

      # update task_cfg.dataset_args
      task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
evalscope/third_party/longbench_write/infer.py CHANGED
@@ -8,7 +8,7 @@ import random
  import torch
  from typing import List

- from evalscope.models.api import OpenaiApi
+ from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
  from evalscope.third_party.longbench_write.utils import count_words
  from evalscope.utils import get_logger