evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (59)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0

evalscope/models/chat_adapter.py

@@ -1,7 +1,7 @@
  import os
  import time
  import torch
- from typing import Union
+ from typing import List, Union

  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.models.local_model import LocalModel
@@ -57,82 +57,104 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

          return generation_config

-     def _model_generate(self, query: str, system_prompt: str = None, infer_cfg: dict = {}) -> str:
+     def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
          """
          Args:
-             query: The input query.
-             system_prompt: The system prompt.
+             queries: The input queries.
+             system_prompts: The system prompts.
              infer_cfg: The inference configuration.
          Returns:
-             The prediction result.
+             The prediction results.
          """
-         # For chat model, use the chat template to format the input
-         if self.tokenizer.chat_template is not None:
-             messages = [ChatMessage(role='user', content=query)]
-             if system_prompt:
-                 messages = [ChatMessage(role='system', content=system_prompt)] + messages
-             formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         else:
-             # For base model, use the query as the input
-             formatted_prompt = query
-
-         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-         input_ids = inputs['input_ids']
-
          # Process infer_cfg
-         if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
+         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
+         if num_return_sequences > 1:
              infer_cfg['do_sample'] = True

          # stop settings
-         stop = infer_cfg.get('stop', None)
-         eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-             if stop else self.tokenizer.eos_token_id
+         stop = infer_cfg.get('stop', [])
+         if stop:
+             eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0]
+         else:
+             eos_token_id = self.tokenizer.eos_token_id

          if eos_token_id is not None:
              infer_cfg['eos_token_id'] = eos_token_id
-             infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

          self.generation_config.update(**infer_cfg)
          fix_do_sample_warning(self.generation_config)

+         # For chat model, use the chat template to format the input
+         if self.tokenizer.chat_template is not None:
+             formatted_prompts = []
+             for i, query in enumerate(queries):
+                 messages = [ChatMessage(role='user', content=query)]
+                 if i < len(system_prompts) and system_prompts[i]:
+                     messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                 formatted_prompts.append(
+                     self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+         else:
+             # For base model, use the queries as the input
+             formatted_prompts = queries
+
+         logger.debug(f'formatted_prompts: {formatted_prompts}')
+
+         # Get input ids
+         inputs = self.tokenizer(
+             formatted_prompts, return_tensors='pt', padding=True, truncation=True,
+             padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+         input_ids = inputs['input_ids']
+
          # Run inference
          output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-         return response
+         responses = []
+         for i in range(0, len(output_ids), num_return_sequences):
+             query_responses = []
+             for j in range(num_return_sequences):
+                 output = output_ids[i + j]
+                 response = self.tokenizer.decode(
+                     output[len(input_ids[i // num_return_sequences]):], skip_special_tokens=True)
+                 query_responses.append(response)
+             responses.append(query_responses)
+
+         return responses

      @torch.no_grad()
-     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
+     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
          """
          Args:
              inputs: The input data.
              infer_cfg: The inference configuration.
          Returns:
-             The prediction result.
+             The prediction results.
          """

          # Process inputs
-         if isinstance(inputs, str):
-             query = inputs
-             system_prompt = None
-         elif isinstance(inputs, dict):
-             query = inputs['data'][0]
-             system_prompt = inputs.get('system_prompt', None)
-         elif isinstance(inputs, list):
-             query = '\n'.join(inputs)
-             system_prompt = None
-         else:
-             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+         queries = []
+         system_prompts = []
+
+         for input_item in inputs:
+             queries.append(input_item['data'][0])
+             system_prompts.append(input_item.get('system_prompt', None))
+
+         responses = self._model_generate(queries, system_prompts, infer_cfg)

-         response = self._model_generate(query, system_prompt, infer_cfg)
+         results = []
+         for response in responses:
+             choices_list = [
+                 ChatCompletionResponseChoice(
+                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
+                 for index, one_response in enumerate(response)
+             ]

-         choices_list = [
-             ChatCompletionResponseChoice(
-                 index=0, message=ChatMessage(content=response, role='assistant'), finish_reason='stop')
-         ]
+             res_d = ChatCompletionResponse(
+                 model=self.model_id,
+                 choices=choices_list,
+                 object='chat.completion',
+                 created=int(time.time()),
+                 usage=None).model_dump(exclude_unset=True)

-         res_d = ChatCompletionResponse(
-             model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
+             results.append(res_d)

-         return res_d
+         return results
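
For reference, the reworked ChatGenerationModelAdapter.predict now consumes a batch of prompt dicts and returns one OpenAI-style chat.completion dict per item. A minimal sketch of the new contract, assuming an already-constructed adapter (construction arguments are omitted; the input and output shapes follow the diff above):

# Hypothetical usage of the batched predict() API shown above.
inputs = [
    {'data': ['What is the capital of France?'], 'system_prompt': 'You are a helpful assistant.'},
    {'data': ['Name three prime numbers.']},  # system_prompt is optional
]
results = adapter.predict(inputs, infer_cfg={'max_new_tokens': 64, 'num_return_sequences': 1})

# One chat.completion dict per input item; each holds num_return_sequences choices.
for res in results:
    print(res['choices'][0]['message']['content'])
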
evalscope/models/choice_adapter.py

@@ -33,12 +33,12 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          return self._DEFAULT_MAX_LENGTH

      @torch.no_grad()
-     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+     def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
          """
          Multi-choice model prediction func.

          Args:
-             inputs (dict): The inputs for a doc. Format:
+             inputs (List[dict]): The inputs for a doc. Format:
                  {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}

              infer_cfg (dict): inference configuration.
@@ -69,37 +69,39 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          infer_cfg = infer_cfg or {}
          self.model.generation_config.update(**infer_cfg)

-         input_data = inputs['data']
-         multi_choices = inputs['multi_choices']
+         input_data = [inp['data'][0] for inp in inputs]
+         multi_choices = [inp['multi_choices'] for inp in inputs]

-         output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-         assert output.shape[0] == 1
-         logits = output.flatten()
+         outputs, input_info = self._get_logits(self.tokenizer, self.model, input_data)

-         choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
-         softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
+         results = []
+         for i, (logits, choices) in enumerate(zip(outputs, multi_choices)):
+             choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in choices]
+             softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)

-         if softval.dtype in {torch.bfloat16, torch.float16}:
-             softval = softval.to(dtype=torch.float32)
-         probs = softval.detach().cpu().numpy()
-         pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
+             if softval.dtype in {torch.bfloat16, torch.float16}:
+                 softval = softval.to(dtype=torch.float32)
+             probs = softval.detach().cpu().numpy()
+             pred: str = choices[int(np.argmax(probs))]  # Format: A or B or C or D

-         res_d = ChatCompletionResponse(
-             model=self.model_id,
-             choices=[
-                 ChatCompletionResponseChoice(
-                     index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
-             ],
-             object='chat.completion',
-             created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
+             res_d = ChatCompletionResponse(
+                 model=self.model_id,
+                 choices=[
+                     ChatCompletionResponseChoice(
+                         index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
+                 ],
+                 object='chat.completion',
+                 created=int(time.time()),
+                 usage=None).model_dump(exclude_unset=True)

-         return res_d
+             results.append(res_d)
+
+         return results

      @staticmethod
      def _get_logits(tokenizer, model, inputs: List[str]):
-         input_ids = tokenizer(inputs, padding=False)['input_ids']
-         input_ids = torch.tensor(input_ids, device=model.device)
+         input_ids = tokenizer(
+             inputs, padding=True, return_tensors='pt', padding_side='left')['input_ids'].to(model.device)
          tokens = {'input_ids': input_ids}

          outputs = model(input_ids)['logits']
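
The multi-choice adapter follows the same batching pattern: each item supplies a full prompt plus its candidate labels, and the returned message content is the predicted label. A rough illustration, with the input format taken from the docstring above and the adapter construction elided:

# Hypothetical batched call to MultiChoiceModelAdapter.predict().
inputs = [
    {'data': ['Question: ...\nA. ...\nB. ...\nC. ...\nD. ...\nAnswer:'], 'multi_choices': ['A', 'B', 'C', 'D']},
    {'data': ['Statement: ...\nA. True\nB. False\nAnswer:'], 'multi_choices': ['A', 'B']},
]
results = adapter.predict(inputs)
predictions = [res['choices'][0]['message']['content'] for res in results]  # e.g. ['C', 'A']
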
@@ -117,11 +119,11 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
          super().__init__(model, **kwargs)

      @torch.no_grad()
-     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+     def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
          """
          Multi-choice model prediction func.
          Args:
-             inputs (dict): The inputs for a doc. Format:
+             inputs (List[dict]): The inputs for a doc. Format:
                  {'data': [(context, continuation), ...]}
              infer_cfg (dict): inference configuration.
          Returns:
@@ -149,24 +151,29 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
          """
          infer_cfg = infer_cfg or {}

-         pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-         res_d = ChatCompletionResponse(
-             model=self.model_id,
-             choices=[{
-                 'index': 0,
-                 'message': {
-                     'content': pred_list,
-                     'role': 'assistant'
-                 }
-             }],
-             object='chat.completion',
-             created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
-
-         return res_d
-
-     def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
+         pred_list: list = []
+         for inp in inputs:
+             pred_list.append(self.loglikelihood(inputs=inp['data'], infer_cfg=infer_cfg))
+
+         results = []
+         for pred in pred_list:
+             res_d = ChatCompletionResponse(
+                 model=self.model_id,
+                 choices=[{
+                     'index': 0,
+                     'message': {
+                         'content': pred,
+                         'role': 'assistant'
+                     }
+                 }],
+                 object='chat.completion',
+                 created=int(time.time()),
+                 usage=None).model_dump(exclude_unset=True)
+             results.append(res_d)
+
+         return results
+
+     def loglikelihood(self, inputs: List[tuple], infer_cfg: dict = None) -> list:
          self.model.generation_config.update(**infer_cfg)
          # To predict one doc
          doc_ele_pred = []
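
ContinuationLogitsModelAdapter is batched the same way, except that each item carries (context, continuation) pairs and the returned message content is the list of per-continuation log-likelihood scores computed by loglikelihood(). A sketch under those assumptions:

# Hypothetical batched call to ContinuationLogitsModelAdapter.predict().
inputs = [
    {'data': [('The sky is', ' blue.'), ('The sky is', ' green.')]},
    {'data': [('2 + 2 =', ' 4'), ('2 + 2 =', ' 5')]},
]
results = adapter.predict(inputs)
# results[i]['choices'][0]['message']['content'] holds one score per
# (context, continuation) pair of the i-th item.
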
evalscope/models/custom_adapter.py

@@ -17,12 +17,12 @@ class CustomModelAdapter(BaseModelAdapter):
          self.custom_model = custom_model
          super(CustomModelAdapter, self).__init__(model=custom_model)

-     def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+     def predict(self, inputs: List[Union[str, dict, list]], **kwargs) -> List[Dict[str, Any]]:
          """
          Model prediction func.

          Args:
-             inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+             inputs (List[Union[str, dict, list]]): The input data. Depending on the specific model.
                  str: 'xxx'
                  dict: {'data': [full_prompt]}
                  list: ['xxx', 'yyy', 'zzz']
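
The custom adapter keeps its per-item flexibility; only the outer shape changes to a list. A hedged example, assuming an already-built CustomModelAdapter named custom_adapter:

# Each batch element may still use any of the per-sample formats listed above.
batch = [
    'What is 2 + 2?',                                   # plain string
    {'data': ['Summarize the following text: ...']},    # dict with a full prompt
    ['context line 1', 'context line 2'],               # list of strings
]
predictions = custom_adapter.predict(batch)  # List[Dict[str, Any]], one entry per item
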
evalscope/models/local_model.py

@@ -37,6 +37,10 @@ class LocalModel:
              cache_dir=model_cache_dir,
          )

+         # Fix no padding
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
          self.model = AutoModelForCausalLM.from_pretrained(
              self.model_id,
              revision=model_revision,
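
The pad-token fix is the usual transformers workaround for causal-LM tokenizers that ship without a padding token, which the left-padded batching introduced above relies on. A standalone illustration (the checkpoint name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # example checkpoint
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token so batched, left-padded encoding works.
    tokenizer.pad_token = tokenizer.eos_token
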
evalscope/models/server_adapter.py

@@ -1,6 +1,6 @@
  import requests
  import time
- from typing import Optional, Union
+ from typing import List, Optional, Union

  from evalscope.models.base_adapter import BaseModelAdapter
  from evalscope.utils.chat_service import ChatMessage
@@ -28,36 +28,35 @@ class ServerModelAdapter(BaseModelAdapter):
          self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
          super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+     def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
          """
          Model prediction func.

          Args:
-             inputs (Union[str, dict, list]): The input data.
+             inputs (List[Union[str, dict, list]]): The input data.
              infer_cfg (dict): Inference configuration.

          Returns:
-             res (dict): The model prediction results.
+             res (List[dict]): The model prediction results.
          """
          infer_cfg = infer_cfg or {}
+         results = []

-         # Process inputs
-         if isinstance(inputs, str):
-             query = inputs
-             system_prompt = None
-         elif isinstance(inputs, dict):
-             data: list = inputs['data']
-             if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-                 query = '\n'.join(''.join(item) for item in data)
-                 system_prompt = inputs.get('system_prompt', None)
-             else:
-                 query = data[0]
-                 system_prompt = inputs.get('system_prompt', None)
-         elif isinstance(inputs, list):
-             query = '\n'.join(inputs)
-             system_prompt = None
+         for input_item in inputs:
+             response = self.process_single_input(input_item, infer_cfg)
+             results.append(response)
+
+         return results
+
+     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
+         """Process a single input item."""
+         data: list = input_item['data']
+         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+             query = '\n'.join(''.join(item) for item in data)
+             system_prompt = input_item.get('system_prompt', None)
          else:
-             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+             query = data[0]
+             system_prompt = input_item.get('system_prompt', None)

          content = self.make_request_content(query, system_prompt)
          request_json = self.make_request(content, infer_cfg)
@@ -68,7 +67,7 @@ class ServerModelAdapter(BaseModelAdapter):
          """
          Make request content for API.
          """
-         if system_prompt is not None:
+         if system_prompt:
              messages = [
                  ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
                  ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
@@ -80,19 +79,14 @@ class ServerModelAdapter(BaseModelAdapter):
      def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
          """Make request to remote API."""
          # Format request JSON according to OpenAI API format
-         do_sample = infer_cfg.get('do_sample', False)
-         temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
-
-         request_json = {
-             **content, 'model': self.model_id,
-             'max_tokens': infer_cfg.get('max_tokens', 2048),
-             'temperature': temperature,
-             'top_p': infer_cfg.get('top_p', 1.0),
-             'n': infer_cfg.get('num_return_sequences', 1),
-             'stop': infer_cfg.get('stop', None)
-         }
-         if self.seed is not None:
-             request_json['seed'] = self.seed
+         from evalscope.config import DEFAULT_GENERATION_CONFIG
+         if infer_cfg == DEFAULT_GENERATION_CONFIG:
+             infer_cfg = {
+                 'max_tokens': 2048,
+                 'temperature': 0.0,
+             }
+
+         request_json = {'model': self.model_id, **content, **infer_cfg}
          logger.debug(f'Request to remote API: {request_json}')
          return request_json
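
With the simplified make_request, whatever keys the caller places in infer_cfg are passed straight through to the OpenAI-compatible endpoint, and a small default is applied only when the stock generation config is detected. The resulting payload looks roughly like this (model name and message text are placeholders):

# Approximate shape of request_json built by make_request_content + make_request.
request_json = {
    'model': 'my-served-model',
    'messages': [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'What is 1 + 1?'},
    ],
    'max_tokens': 2048,   # default applied when infer_cfg equals DEFAULT_GENERATION_CONFIG
    'temperature': 0.0,
}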