evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

Files changed (68)
  1. evalscope/arguments.py +1 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +9 -9
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  10. evalscope/benchmarks/data_adapter.py +31 -21
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  23. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  24. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  25. evalscope/benchmarks/race/race_adapter.py +12 -16
  26. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  27. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  28. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  29. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  30. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  31. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  32. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  35. evalscope/benchmarks/utils.py +43 -0
  36. evalscope/collections/evaluator.py +11 -2
  37. evalscope/config.py +10 -2
  38. evalscope/constants.py +7 -0
  39. evalscope/metrics/named_metrics.py +1 -0
  40. evalscope/models/__init__.py +2 -1
  41. evalscope/models/base_adapter.py +25 -5
  42. evalscope/models/chat_adapter.py +3 -0
  43. evalscope/models/choice_adapter.py +4 -0
  44. evalscope/models/custom_adapter.py +2 -0
  45. evalscope/models/register.py +28 -0
  46. evalscope/models/server_adapter.py +35 -8
  47. evalscope/perf/arguments.py +13 -7
  48. evalscope/perf/http_client.py +6 -4
  49. evalscope/perf/utils/analysis_result.py +1 -1
  50. evalscope/report/app.py +3 -0
  51. evalscope/report/combinator.py +2 -2
  52. evalscope/run.py +5 -4
  53. evalscope/third_party/thinkbench/eval.py +220 -55
  54. evalscope/third_party/thinkbench/infer.py +37 -7
  55. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  56. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  57. evalscope/utils/chat_service.py +1 -0
  58. evalscope/utils/filters.py +59 -0
  59. evalscope/utils/logger.py +3 -3
  60. evalscope/version.py +2 -2
  61. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
  62. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
  63. tests/cli/test_collection.py +1 -1
  64. tests/cli/test_run.py +135 -28
  65. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  66. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  67. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/models/chat_adapter.py CHANGED
@@ -3,8 +3,10 @@ import time
 import torch
 from typing import List, Union
 
+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
 logger = get_logger()
 
 
+@register_model_adapter(OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.
evalscope/models/choice_adapter.py CHANGED
@@ -3,11 +3,14 @@ import time
 import torch
 from typing import List
 
+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 
 
+@register_model_adapter(OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """
 
@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}
 
 
+@register_model_adapter(OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/custom_adapter.py CHANGED
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.custom import CustomModel
+from evalscope.models.register import register_model_adapter
 
 
+@register_model_adapter('custom')
 class CustomModelAdapter(BaseModelAdapter):
 
     def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/register.py ADDED
@@ -0,0 +1,28 @@
+MODEL_ADAPTERS = {}
+
+
+def register_model_adapter(name):
+    """
+    Decorator to register a model adapter with a given name.
+    :param name: The name of the model adapter.
+    """
+
+    def decorator(adapter):
+        if name in MODEL_ADAPTERS:
+            raise ValueError(f"Model adapter '{name}' is already registered.")
+        MODEL_ADAPTERS[name] = adapter
+        return adapter
+
+    return decorator
+
+
+def get_model_adapter(name):
+    """
+    Retrieve a registered model adapter by name.
+    :param name: The name of the model adapter.
+    :return: The model adapter class or function.
+    """
+    if name not in MODEL_ADAPTERS:
+        raise ValueError(
+            f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+    return MODEL_ADAPTERS[name]
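Note: the new registry above is a plain decorator-plus-dict lookup, and the built-in adapters register themselves under the OutputType constants, 'custom', and 'server' in the diffs that follow. A minimal usage sketch, assuming evalscope 0.12.1 is installed (the MyAdapter class and the 'my-adapter' key are hypothetical illustrations, not evalscope code):

    from evalscope.models.register import get_model_adapter, register_model_adapter

    @register_model_adapter('my-adapter')  # hypothetical name, for illustration only
    class MyAdapter:

        def predict(self, inputs, infer_cfg=None):
            return inputs

    adapter_cls = get_model_adapter('my-adapter')  # returns the MyAdapter class
    try:
        get_model_adapter('unknown')
    except ValueError as err:
        print(err)  # the error message lists the registered adapter names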
evalscope/models/server_adapter.py CHANGED
@@ -1,15 +1,18 @@
 import openai
 from collections import defaultdict
+from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+@register_model_adapter('server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.
@@ -30,6 +33,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
+        self.supported_params = self._get_supported_params()
 
         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -37,12 +41,16 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
 
-    def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
+    def _get_supported_params(self):
+        sig = signature(self.client.chat.completions.create)
+        return list(sig.parameters.keys())
+
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.
 
         Args:
-            inputs (List[Union[str, dict, list]]): The input data.
+            inputs (List[dict]): The input data.
             infer_cfg (dict): Inference configuration.
 
         Returns:
@@ -104,34 +112,52 @@ class ServerModelAdapter(BaseModelAdapter):
             request_json['stream_options'] = {'include_usage': True}
 
         logger.debug(f'Request to remote API: {request_json}')
+
         return request_json
 
+    def _parse_extra_params(self, request_json):
+        api_params = {}
+        extra_body = {}
+        for key, value in request_json.items():
+            if key in self.supported_params:
+                api_params[key] = value
+            else:
+                extra_body[key] = value
+
+        if extra_body:
+            api_params['extra_body'] = extra_body
+        return api_params
+
     def send_request(self, request_json: dict) -> dict:
         try:
-            response = self.client.chat.completions.create(**request_json)
+            parsed_request = self._parse_extra_params(request_json)
+            response = self.client.chat.completions.create(**parsed_request)
 
-            if self.stream:
+            if response and self.stream:
                 response = self._collect_stream_response(response)
 
             return response.model_dump(exclude_unset=True)
         except Exception as e:
-            logger.error(f'Error when calling OpenAI API: {str(e)}')
+            logger.error(f'Error when calling remote API: {str(e)}')
            raise
 
     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
         collected_messages = defaultdict(list)
+        collected_reasoning = defaultdict(list)
 
         for chunk in response_stream:
             collected_chunks.append(chunk)
             for choice in chunk.choices:
+                if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                    collected_reasoning[choice.index].append(choice.delta.reasoning_content)
                 if choice.delta.content is not None:
                     collected_messages[choice.index].append(choice.delta.content)
 
         choices = []
         for index, messages in collected_messages.items():
             full_reply_content = ''.join(messages)
-
+            reasoning = ''.join(collected_reasoning[index])
             # use the finish_reason from the last chunk that generated this choice
             finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -140,9 +166,10 @@ class ServerModelAdapter(BaseModelAdapter):
                     break
 
             choice = Choice(
-                finish_reason=finish_reason,
+                finish_reason=finish_reason or 'stop',
                 index=index,
-                message=ChatCompletionMessage(role='assistant', content=full_reply_content))
+                message=ChatCompletionMessage(
+                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
             choices.append(choice)
 
         # build the final completion object
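Note: the behavioural change above is that request keys not present in the signature of the OpenAI client's chat.completions.create are now collected into extra_body instead of being passed as top-level keyword arguments. A standalone sketch of that splitting idea (split_params, fake_create, and the sample payload are illustrative stand-ins, not evalscope or OpenAI SDK code):

    from inspect import signature

    def split_params(request_json: dict, create_fn) -> dict:
        # keys accepted by create_fn pass through; everything else goes into 'extra_body'
        supported = set(signature(create_fn).parameters)
        api_params = {k: v for k, v in request_json.items() if k in supported}
        extra = {k: v for k, v in request_json.items() if k not in supported}
        if extra:
            api_params['extra_body'] = extra
        return api_params

    def fake_create(model=None, messages=None, temperature=None):
        # illustrative stand-in for client.chat.completions.create
        return model, messages, temperature

    print(split_params({'model': 'qwen2.5', 'messages': [], 'top_k': 20}, fake_create))
    # {'model': 'qwen2.5', 'messages': [], 'extra_body': {'top_k': 20}}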
evalscope/perf/arguments.py CHANGED
@@ -21,9 +21,9 @@ class Arguments:
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions' # URL for the API connection
     headers: Dict[str, Any] = field(default_factory=dict) # Custom headers
-    connect_timeout: int = 120 # Connection timeout in seconds
-    read_timeout: int = 120 # Read timeout in seconds
-    api_key: str = 'EMPTY'
+    connect_timeout: int = 600 # Connection timeout in seconds
+    read_timeout: int = 600 # Read timeout in seconds
+    api_key: Optional[str] = None
 
     # Performance and parallelism
     number: Optional[int] = None # Number of requests to be made
@@ -125,7 +125,13 @@ class ParseKVAction(argparse.Action):
             setattr(namespace, self.dest, {})
         else:
             try:
-                kv_dict = dict(kv.split('=') for kv in values)
+                kv_dict = {}
+                for kv in values:
+                    parts = kv.split('=', 1)  # only split the first '='
+                    if len(parts) != 2:
+                        raise ValueError(f'Invalid key-value pair: {kv}')
+                    key, value = parts
+                    kv_dict[key.strip()] = value.strip()
                 setattr(namespace, self.dest, kv_dict)
             except ValueError as e:
                 parser.error(f'Error parsing key-value pairs: {e}')
@@ -144,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-    parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
-    parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
-    parser.add_argument('--read-timeout', type=int, default=120, help='The network read timeout')
+    parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+    parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+    parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
 
     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
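Note: the ParseKVAction change splits each KEY=VALUE header on the first '=' only, so values that themselves contain '=' (for example bearer tokens with base64 padding) are preserved. A standalone sketch of that behaviour (parse_kv_pairs and the sample headers are illustrative, not the evalscope CLI itself):

    def parse_kv_pairs(values):
        kv_dict = {}
        for kv in values:
            parts = kv.split('=', 1)  # only split on the first '='
            if len(parts) != 2:
                raise ValueError(f'Invalid key-value pair: {kv}')
            key, value = parts
            kv_dict[key.strip()] = value.strip()
        return kv_dict

    print(parse_kv_pairs(['Authorization=Bearer abc==', 'X-Trace-Id=123']))
    # {'Authorization': 'Bearer abc==', 'X-Trace-Id': '123'}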
evalscope/perf/http_client.py CHANGED
@@ -23,10 +23,7 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.client = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(
-                total=self.read_timeout + self.connect_timeout,
-                connect=self.connect_timeout,
-                sock_read=self.read_timeout),
+            timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
@@ -102,6 +99,11 @@ class AioHttpClient:
             async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                 async for rsp in self._handle_response(response):
                     yield rsp
+        except asyncio.TimeoutError:
+            logger.error(
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+            )
+            yield (True, None, 'Timeout')
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
             yield (True, None, e)
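Note: dropping the total budget means a long-running (for example streaming) response is no longer aborted once an overall deadline elapses; only the connection phase and each individual socket read are bounded, and asyncio.TimeoutError now gets a dedicated error message pointing at the timeout settings. A minimal aiohttp sketch of the new timeout shape, assuming a server is listening at the illustrative URL below:

    import asyncio

    import aiohttp

    async def main():
        # only the TCP connect and each socket read are limited; no 'total' deadline
        timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get('http://127.0.0.1:8877/v1/models') as resp:
                print(resp.status)

    asyncio.run(main())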
evalscope/perf/utils/analysis_result.py CHANGED
@@ -3,7 +3,7 @@ import json
 import pickle
 import sqlite3
 
-result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
+result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
     FROM result WHERE success='1'"
evalscope/report/app.py CHANGED
@@ -125,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):
 
 
 def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+    logger.debug(f'df: {df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
evalscope/report/combinator.py CHANGED
@@ -57,8 +57,8 @@ class ReportsRecorder:
 
 
 if __name__ == '__main__':
-    report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
-    # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+    report_dir_1 = './outputs/20250117_151926'
+    # report_dir_2 = './outputs/20250107_204445/reports'
 
     report_table = gen_table([report_dir_1])
     print(report_table)
evalscope/run.py CHANGED
@@ -2,7 +2,7 @@
 """
 Run evaluation for LLMs.
 """
-import os.path
+import os
 from argparse import Namespace
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union
@@ -127,16 +127,17 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.models import initialize_model_adapter
 
     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-    # Initialize data adapter
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
 
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
+        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
         return EvaluatorCollection(task_cfg, data_adapter, outputs)
 
     # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+    model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
 
     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()