evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (55)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  22. evalscope/benchmarks/math_500/__init__.py +0 -0
  23. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  26. evalscope/benchmarks/race/race_adapter.py +3 -3
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  29. evalscope/collections/evaluator.py +103 -39
  30. evalscope/collections/sampler.py +2 -1
  31. evalscope/collections/schema.py +1 -2
  32. evalscope/config.py +1 -0
  33. evalscope/evaluator/evaluator.py +78 -64
  34. evalscope/metrics/math_parser.py +526 -0
  35. evalscope/metrics/metrics.py +16 -1
  36. evalscope/metrics/named_metrics.py +31 -7
  37. evalscope/models/chat_adapter.py +69 -49
  38. evalscope/models/choice_adapter.py +52 -45
  39. evalscope/models/custom_adapter.py +2 -2
  40. evalscope/models/local_model.py +4 -0
  41. evalscope/models/server_adapter.py +28 -34
  42. evalscope/report/app.py +30 -15
  43. evalscope/run.py +10 -7
  44. evalscope/utils/chat_service.py +2 -2
  45. evalscope/utils/io_utils.py +1 -1
  46. evalscope/version.py +2 -2
  47. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
  48. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
  49. tests/cli/test_run.py +93 -16
  50. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  51. evalscope/metrics/math_accuracy.py +0 -200
  52. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  53. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  54. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  55. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/models/chat_adapter.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import time
 import torch
-from typing import Union
+from typing import List, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
@@ -57,84 +57,104 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return generation_config

-    def _model_generate(self, query: str, system_prompt: str = None, infer_cfg: dict = {}) -> str:
+    def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
         """
         Args:
-            query: The input query.
-            system_prompt: The system prompt.
+            queries: The input queries.
+            system_prompts: The system prompts.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction result.
+            The prediction results.
         """
-        # For chat model, use the chat template to format the input
-        if self.tokenizer.chat_template is not None:
-            messages = [ChatMessage(role='user', content=query)]
-            if system_prompt:
-                messages = [ChatMessage(role='system', content=system_prompt)] + messages
-            formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        else:
-            # For base model, use the query as the input
-            formatted_prompt = query
-
-        logger.debug(f'formatted_prompt: {formatted_prompt}')
-
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
         # Process infer_cfg
-        if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
+        num_return_sequences = infer_cfg.get('num_return_sequences', 1)
+        if num_return_sequences > 1:
             infer_cfg['do_sample'] = True

         # stop settings
-        stop = infer_cfg.get('stop', None)
-        eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-            if stop else self.tokenizer.eos_token_id
+        stop = infer_cfg.get('stop', [])
+        if stop:
+            eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0]
+        else:
+            eos_token_id = self.tokenizer.eos_token_id

         if eos_token_id is not None:
             infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

         self.generation_config.update(**infer_cfg)
         fix_do_sample_warning(self.generation_config)

+        # For chat model, use the chat template to format the input
+        if self.tokenizer.chat_template is not None:
+            formatted_prompts = []
+            for i, query in enumerate(queries):
+                messages = [ChatMessage(role='user', content=query)]
+                if i < len(system_prompts) and system_prompts[i]:
+                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                formatted_prompts.append(
+                    self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+        else:
+            # For base model, use the queries as the input
+            formatted_prompts = queries
+
+        logger.debug(f'formatted_prompts: {formatted_prompts}')
+
+        # Get input ids
+        inputs = self.tokenizer(
+            formatted_prompts, return_tensors='pt', padding=True, truncation=True,
+            padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+        input_ids = inputs['input_ids']
+
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-        return response
+        responses = []
+        for i in range(0, len(output_ids), num_return_sequences):
+            query_responses = []
+            for j in range(num_return_sequences):
+                output = output_ids[i + j]
+                response = self.tokenizer.decode(
+                    output[len(input_ids[i // num_return_sequences]):], skip_special_tokens=True)
+                query_responses.append(response)
+            responses.append(query_responses)
+
+        return responses

     @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
         """
         Args:
             inputs: The input data.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction result.
+            The prediction results.
         """

         # Process inputs
-        if isinstance(inputs, str):
-            query = inputs
-            system_prompt = None
-        elif isinstance(inputs, dict):
-            query = inputs['data'][0]
-            system_prompt = inputs.get('system_prompt', None)
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-            system_prompt = None
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+        queries = []
+        system_prompts = []
+
+        for input_item in inputs:
+            queries.append(input_item['data'][0])
+            system_prompts.append(input_item.get('system_prompt', None))
+
+        responses = self._model_generate(queries, system_prompts, infer_cfg)

-        response = self._model_generate(query, system_prompt, infer_cfg)
+        results = []
+        for response in responses:
+            choices_list = [
+                ChatCompletionResponseChoice(
+                    index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
+                for index, one_response in enumerate(response)
+            ]

-        choices_list = [
-            ChatCompletionResponseChoice(
-                index=0, message=ChatMessage(content=response, role='assistant'), finish_reason='stop')
-        ]
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=choices_list,
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)

-        res_d = ChatCompletionResponse(
-            model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
-            usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)

-        return res_d
+        return results
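
Note: the rewritten _model_generate relies on generate() returning num_return_sequences sequences per prompt, in prompt order, which is why the flat output is regrouped into one list of responses per query. A minimal, self-contained sketch of that regrouping (the helper name group_outputs is ours, not part of evalscope):

    # Regroup a flat list of generations into num_return_sequences outputs per prompt,
    # mirroring the decoding loop in the new _model_generate.
    def group_outputs(flat_outputs, num_return_sequences):
        return [
            flat_outputs[i:i + num_return_sequences]
            for i in range(0, len(flat_outputs), num_return_sequences)
        ]

    # e.g. 2 queries with 2 sampled completions each
    print(group_outputs(['a1', 'a2', 'b1', 'b2'], 2))  # [['a1', 'a2'], ['b1', 'b2']]
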
evalscope/models/choice_adapter.py CHANGED
@@ -33,12 +33,12 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return self._DEFAULT_MAX_LENGTH

     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.

         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}

             infer_cfg (dict): inference configuration.
@@ -69,37 +69,39 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         infer_cfg = infer_cfg or {}
         self.model.generation_config.update(**infer_cfg)

-        input_data = inputs['data']
-        multi_choices = inputs['multi_choices']
+        input_data = [inp['data'][0] for inp in inputs]
+        multi_choices = [inp['multi_choices'] for inp in inputs]

-        output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-        assert output.shape[0] == 1
-        logits = output.flatten()
+        outputs, input_info = self._get_logits(self.tokenizer, self.model, input_data)

-        choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
-        softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
+        results = []
+        for i, (logits, choices) in enumerate(zip(outputs, multi_choices)):
+            choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in choices]
+            softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)

-        if softval.dtype in {torch.bfloat16, torch.float16}:
-            softval = softval.to(dtype=torch.float32)
-        probs = softval.detach().cpu().numpy()
-        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
+            if softval.dtype in {torch.bfloat16, torch.float16}:
+                softval = softval.to(dtype=torch.float32)
+            probs = softval.detach().cpu().numpy()
+            pred: str = choices[int(np.argmax(probs))]  # Format: A or B or C or D

-        res_d = ChatCompletionResponse(
-            model=self.model_id,
-            choices=[
-                ChatCompletionResponseChoice(
-                    index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
-            ],
-            object='chat.completion',
-            created=int(time.time()),
-            usage=None).model_dump(exclude_unset=True)
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[
+                    ChatCompletionResponseChoice(
+                        index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
+                ],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)

-        return res_d
+            results.append(res_d)
+
+        return results

     @staticmethod
     def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(inputs, padding=False)['input_ids']
-        input_ids = torch.tensor(input_ids, device=model.device)
+        input_ids = tokenizer(
+            inputs, padding=True, return_tensors='pt', padding_side='left')['input_ids'].to(model.device)
         tokens = {'input_ids': input_ids}

         outputs = model(input_ids)['logits']
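
Note: the batched multi-choice path keeps the original per-item scoring: take the logit of each choice's last token, softmax over the choices, and pick the argmax. A self-contained sketch with toy logits (values are illustrative; in the adapter they come from _get_logits):

    # Toy version of the per-item choice scoring loop above.
    import torch

    choices = ['A', 'B', 'C', 'D']
    last_token_logits = torch.tensor([1.2, 3.4, 0.5, -0.7])  # one logit per choice token
    probs = torch.nn.functional.softmax(last_token_logits.float(), dim=0)
    pred = choices[int(torch.argmax(probs))]
    print(pred)  # 'B'
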
@@ -117,11 +119,11 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         super().__init__(model, **kwargs)

     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.
         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [(context, continuation), ...]}
             infer_cfg (dict): inference configuration.
         Returns:
@@ -149,24 +151,29 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         """
         infer_cfg = infer_cfg or {}

-        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-        res_d = ChatCompletionResponse(
-            model=self.model_id,
-            choices=[{
-                'index': 0,
-                'message': {
-                    'content': pred_list,
-                    'role': 'assistant'
-                }
-            }],
-            object='chat.completion',
-            created=int(time.time()),
-            usage=None).model_dump(exclude_unset=True)
-
-        return res_d
-
-    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
+        pred_list: list = []
+        for inp in inputs:
+            pred_list.append(self.loglikelihood(inputs=inp['data'], infer_cfg=infer_cfg))
+
+        results = []
+        for pred in pred_list:
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[{
+                    'index': 0,
+                    'message': {
+                        'content': pred,
+                        'role': 'assistant'
+                    }
+                }],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)
+
+        return results
+
+    def loglikelihood(self, inputs: List[tuple], infer_cfg: dict = None) -> list:
         self.model.generation_config.update(**infer_cfg)
         # To predict one doc
         doc_ele_pred = []
evalscope/models/custom_adapter.py CHANGED
@@ -17,12 +17,12 @@ class CustomModelAdapter(BaseModelAdapter):
         self.custom_model = custom_model
         super(CustomModelAdapter, self).__init__(model=custom_model)

-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+    def predict(self, inputs: List[Union[str, dict, list]], **kwargs) -> List[Dict[str, Any]]:
         """
         Model prediction func.

         Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+            inputs (List[Union[str, dict, list]]): The input data. Depending on the specific model.
                 str: 'xxx'
                 dict: {'data': [full_prompt]}
                 list: ['xxx', 'yyy', 'zzz']
evalscope/models/local_model.py CHANGED
@@ -37,6 +37,10 @@ class LocalModel:
             cache_dir=model_cache_dir,
         )

+        # Fix no padding
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_id,
             revision=model_revision,
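
Note: this fallback matters because the adapters above now tokenize batches with padding=True, which fails on tokenizers that ship without a pad token. A small sketch using a public checkpoint as a stand-in (the model choice is illustrative only):

    # Any causal LM tokenizer without a pad token behaves the same way.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # batched padding now works
    batch = tokenizer(['short prompt', 'a somewhat longer prompt'],
                      padding=True, return_tensors='pt')
    print(batch['input_ids'].shape)
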
evalscope/models/server_adapter.py CHANGED
@@ -1,6 +1,6 @@
 import requests
 import time
-from typing import Optional, Union
+from typing import List, Optional, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.utils.chat_service import ChatMessage
@@ -28,36 +28,35 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.

         Args:
-            inputs (Union[str, dict, list]): The input data.
+            inputs (List[Union[str, dict, list]]): The input data.
             infer_cfg (dict): Inference configuration.

         Returns:
-            res (dict): The model prediction results.
+            res (List[dict]): The model prediction results.
         """
         infer_cfg = infer_cfg or {}
+        results = []

-        # Process inputs
-        if isinstance(inputs, str):
-            query = inputs
-            system_prompt = None
-        elif isinstance(inputs, dict):
-            data: list = inputs['data']
-            if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
-                query = '\n'.join(''.join(item) for item in data)
-                system_prompt = inputs.get('system_prompt', None)
-            else:
-                query = data[0]
-                system_prompt = inputs.get('system_prompt', None)
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-            system_prompt = None
+        for input_item in inputs:
+            response = self.process_single_input(input_item, infer_cfg)
+            results.append(response)
+
+        return results
+
+    def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
+        """Process a single input item."""
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
         else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)

         content = self.make_request_content(query, system_prompt)
         request_json = self.make_request(content, infer_cfg)
@@ -68,7 +67,7 @@ class ServerModelAdapter(BaseModelAdapter):
         """
         Make request content for API.
         """
-        if system_prompt is not None:
+        if system_prompt:
             messages = [
                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
@@ -80,19 +79,14 @@ class ServerModelAdapter(BaseModelAdapter):
     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-        do_sample = infer_cfg.get('do_sample', False)
-        temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
-
-        request_json = {
-            **content, 'model': self.model_id,
-            'max_tokens': infer_cfg.get('max_tokens', 2048),
-            'temperature': temperature,
-            'top_p': infer_cfg.get('top_p', 1.0),
-            'n': infer_cfg.get('num_return_sequences', 1),
-            'stop': infer_cfg.get('stop', None)
-        }
-        if self.seed is not None:
-            request_json['seed'] = self.seed
+        from evalscope.config import DEFAULT_GENERATION_CONFIG
+        if infer_cfg == DEFAULT_GENERATION_CONFIG:
+            infer_cfg = {
+                'max_tokens': 2048,
+                'temperature': 0.0,
+            }
+
+        request_json = {'model': self.model_id, **content, **infer_cfg}
         logger.debug(f'Request to remote API: {request_json}')
         return request_json

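Note: make_request now forwards the caller's infer_cfg directly as OpenAI-style request parameters instead of remapping individual fields. A minimal sketch of the resulting payload (the model_id value is illustrative):

    # The request body is just a merge of model id, message content, and infer_cfg.
    content = {'messages': [{'role': 'user', 'content': 'Hello'}]}
    infer_cfg = {'max_tokens': 512, 'temperature': 0.7, 'n': 1}
    model_id = 'my-model'  # illustrative

    request_json = {'model': model_id, **content, **infer_cfg}
    print(request_json)
    # {'model': 'my-model', 'messages': [...], 'max_tokens': 512, 'temperature': 0.7, 'n': 1}
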
evalscope/report/app.py CHANGED
@@ -6,6 +6,7 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

@@ -218,7 +219,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
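
Note: convert_html_tags rewrites angle-bracket tags such as `<think>` into bracket form so they stay visible as text in the report view instead of being interpreted as HTML. A quick self-contained check (function body copied from the diff above):

    import re

    def convert_html_tags(text):
        text = re.sub(r'<(\w+)>', r'[\1]', text)
        text = re.sub(r'</(\w+)>', r'[/\1]', text)
        return text

    print(convert_html_tags('<think>step 1</think> answer'))  # [think]step 1[/think] answer
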
@@ -226,9 +236,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-        return dict_to_markdown(item)
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-        return '\n'.join([process_model_prediction(item) for item in item])
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

@@ -257,19 +269,20 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-        raw_pred_answer = item['choices'][0]['message']['content']
-        parsed_gold_answer = item['choices'][0]['review']['gold']
-        parsed_pred_answer = item['choices'][0]['review']['pred']
-        score = item['choices'][0]['review']['result']
-        raw_d = {
-            'Input': raw_input,
-            'Generated': raw_pred_answer,
-            'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-            'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
-            'Score': score,
-            'NScore': normalize_score(score)
-        }
-        ds.append(raw_d)
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
@@ -284,6 +297,8 @@ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: i
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
     styler = style_df(df_subset, columns=['NScore'])
     return df_subset, styler
evalscope/run.py CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-    task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,7 +114,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
+        res_dict = evaluator.eval()
         eval_results[dataset_name] = res_dict

     return eval_results
@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,
evalscope/utils/chat_service.py CHANGED
@@ -174,7 +174,7 @@ class ChatService:
         )

     def _prepare_text_inputs(self, request: TextCompletionRequest):
-        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=True).to(self.device)
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return inputs, prompt_tokens

@@ -204,7 +204,7 @@
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
             request.messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens

evalscope/utils/io_utils.py CHANGED
@@ -135,7 +135,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
     Dump dict to yaml file.
     """
     with open(yaml_file, 'w') as f:
-        yaml.dump(d, f, default_flow_style=False)
+        yaml.dump(d, f, default_flow_style=False, allow_unicode=True)


 def json_to_dict(json_file) -> dict:
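
Note: allow_unicode=True keeps non-ASCII text readable in the dumped YAML; without it PyYAML escapes such characters. A minimal illustration:

    import yaml

    d = {'question': '你好'}
    print(yaml.dump(d, default_flow_style=False))                      # escaped, e.g. question: "\u4F60\u597D"
    print(yaml.dump(d, default_flow_style=False, allow_unicode=True))  # kept readable: question: 你好
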
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-__version__ = '0.10.1'
-__release_datetime__ = '2025-01-23 13:00:00'
+__version__ = '0.11.0'
+__release_datetime__ = '2025-02-13 12:00:00'