evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +30 -15
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/models/chat_adapter.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 import time
 import torch
-from typing import Union
+from typing import List, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
@@ -57,84 +57,104 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         return generation_config

-    def _model_generate(self,
+    def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
         """
         Args:
-
-
+            queries: The input queries.
+            system_prompts: The system prompts.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction
+            The prediction results.
         """
-        # For chat model, use the chat template to format the input
-        if self.tokenizer.chat_template is not None:
-            messages = [ChatMessage(role='user', content=query)]
-            if system_prompt:
-                messages = [ChatMessage(role='system', content=system_prompt)] + messages
-            formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        else:
-            # For base model, use the query as the input
-            formatted_prompt = query
-
-        logger.debug(f'formatted_prompt: {formatted_prompt}')
-
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
         # Process infer_cfg
-
+        num_return_sequences = infer_cfg.get('num_return_sequences', 1)
+        if num_return_sequences > 1:
             infer_cfg['do_sample'] = True

         # stop settings
-        stop = infer_cfg.get('stop',
-
-
+        stop = infer_cfg.get('stop', [])
+        if stop:
+            eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0]
+        else:
+            eos_token_id = self.tokenizer.eos_token_id

         if eos_token_id is not None:
             infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

         self.generation_config.update(**infer_cfg)
         fix_do_sample_warning(self.generation_config)

+        # For chat model, use the chat template to format the input
+        if self.tokenizer.chat_template is not None:
+            formatted_prompts = []
+            for i, query in enumerate(queries):
+                messages = [ChatMessage(role='user', content=query)]
+                if i < len(system_prompts) and system_prompts[i]:
+                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                formatted_prompts.append(
+                    self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+        else:
+            # For base model, use the queries as the input
+            formatted_prompts = queries
+
+        logger.debug(f'formatted_prompts: {formatted_prompts}')
+
+        # Get input ids
+        inputs = self.tokenizer(
+            formatted_prompts, return_tensors='pt', padding=True, truncation=True,
+            padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+        input_ids = inputs['input_ids']
+
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-
-
+        responses = []
+        for i in range(0, len(output_ids), num_return_sequences):
+            query_responses = []
+            for j in range(num_return_sequences):
+                output = output_ids[i + j]
+                response = self.tokenizer.decode(
+                    output[len(input_ids[i // num_return_sequences]):], skip_special_tokens=True)
+                query_responses.append(response)
+            responses.append(query_responses)
+
+        return responses

     @torch.no_grad()
-    def predict(self, inputs:
+    def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
         """
         Args:
             inputs: The input data.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction
+            The prediction results.
         """

         # Process inputs
-
-
-
-
-
-
-
-
-            system_prompt = None
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+        queries = []
+        system_prompts = []
+
+        for input_item in inputs:
+            queries.append(input_item['data'][0])
+            system_prompts.append(input_item.get('system_prompt', None))
+
+        responses = self._model_generate(queries, system_prompts, infer_cfg)

-
+        results = []
+        for response in responses:
+            choices_list = [
+                ChatCompletionResponseChoice(
+                    index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
+                for index, one_response in enumerate(response)
+            ]

-
-
-
-
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=choices_list,
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)

-
-                model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
-                usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)

-        return
+        return results
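Taken together, the hunks above change ChatGenerationModelAdapter from a per-sample to a batched interface. Below is a minimal usage sketch: the input and output shapes ({'data': [...], 'system_prompt': ...} in, OpenAI-style chat.completion dicts out) follow the diff, while the checkpoint name and constructor arguments are assumptions for illustration only.

    # Hedged sketch of the batched predict() shown above; LocalModel/adapter
    # constructor arguments are assumptions, the dict shapes follow the diff.
    from evalscope.models.local_model import LocalModel
    from evalscope.models.chat_adapter import ChatGenerationModelAdapter

    model = LocalModel(model_id='Qwen/Qwen2.5-0.5B-Instruct')  # assumed checkpoint and kwargs
    adapter = ChatGenerationModelAdapter(model=model)

    inputs = [
        {'data': ['What is 2 + 2?'], 'system_prompt': 'You are a helpful assistant.'},
        {'data': ['Name a prime number greater than 10.']},  # no system prompt for this item
    ]
    results = adapter.predict(inputs, infer_cfg={'max_new_tokens': 64, 'num_return_sequences': 1})

    # Each result is a chat.completion-style dict built via ChatCompletionResponse.model_dump();
    # the text for query i, sample j sits at results[i]['choices'][j]['message']['content'].
    print(results[0]['choices'][0]['message']['content'])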
evalscope/models/choice_adapter.py
CHANGED
@@ -33,12 +33,12 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return self._DEFAULT_MAX_LENGTH

     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.

         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}

             infer_cfg (dict): inference configuration.
@@ -69,37 +69,39 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         infer_cfg = infer_cfg or {}
         self.model.generation_config.update(**infer_cfg)

-        input_data =
-        multi_choices =
+        input_data = [inp['data'][0] for inp in inputs]
+        multi_choices = [inp['multi_choices'] for inp in inputs]

-
-        assert output.shape[0] == 1
-        logits = output.flatten()
+        outputs, input_info = self._get_logits(self.tokenizer, self.model, input_data)

-
-
+        results = []
+        for i, (logits, choices) in enumerate(zip(outputs, multi_choices)):
+            choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in choices]
+            softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)

-
-
-
-
+            if softval.dtype in {torch.bfloat16, torch.float16}:
+                softval = softval.to(dtype=torch.float32)
+            probs = softval.detach().cpu().numpy()
+            pred: str = choices[int(np.argmax(probs))]  # Format: A or B or C or D

-
-
-
-
-
-
-
-
-
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[
+                    ChatCompletionResponseChoice(
+                        index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
+                ],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)

-
+            results.append(res_d)
+
+        return results

     @staticmethod
     def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(
-
+        input_ids = tokenizer(
+            inputs, padding=True, return_tensors='pt', padding_side='left')['input_ids'].to(model.device)
         tokens = {'input_ids': input_ids}

         outputs = model(input_ids)['logits']
@@ -117,11 +119,11 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         super().__init__(model, **kwargs)

     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.
         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [(context, continuation), ...]}
             infer_cfg (dict): inference configuration.
         Returns:
@@ -149,24 +151,29 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         """
         infer_cfg = infer_cfg or {}

-        pred_list: list =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pred_list: list = []
+        for inp in inputs:
+            pred_list.append(self.loglikelihood(inputs=inp['data'], infer_cfg=infer_cfg))
+
+        results = []
+        for pred in pred_list:
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[{
+                    'index': 0,
+                    'message': {
+                        'content': pred,
+                        'role': 'assistant'
+                    }
+                }],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)
+
+        return results
+
+    def loglikelihood(self, inputs: List[tuple], infer_cfg: dict = None) -> list:
         self.model.generation_config.update(**infer_cfg)
         # To predict one doc
         doc_ele_pred = []
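The batched MultiChoiceModelAdapter.predict above scores each option by taking the final-position logit of the option letter's token, softmaxing over the options, and picking the argmax. A self-contained sketch of that selection step, with fabricated token ids and random logits standing in for the real tokenizer and model:

    # Stand-alone illustration of the choice-selection step; ids and logits are made up.
    import numpy as np
    import torch

    choices = ['A', 'B', 'C', 'D']
    letter_token_id = {'A': 32, 'B': 33, 'C': 34, 'D': 35}  # assumed ids; real ones come from the tokenizer
    logits = torch.randn(100)  # pretend last-position logits over a 100-token vocabulary

    choice_logits = [logits[letter_token_id[ch]].item() for ch in choices]
    softval = torch.softmax(torch.tensor(choice_logits), dim=0)
    probs = softval.numpy()
    pred = choices[int(np.argmax(probs))]  # e.g. 'C'
    print(pred, probs)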
evalscope/models/custom_adapter.py
CHANGED
@@ -17,12 +17,12 @@ class CustomModelAdapter(BaseModelAdapter):
         self.custom_model = custom_model
         super(CustomModelAdapter, self).__init__(model=custom_model)

-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+    def predict(self, inputs: List[Union[str, dict, list]], **kwargs) -> List[Dict[str, Any]]:
         """
         Model prediction func.

         Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+            inputs (List[Union[str, dict, list]]): The input data. Depending on the specific model.
                 str: 'xxx'
                 dict: {'data': [full_prompt]}
                 list: ['xxx', 'yyy', 'zzz']
evalscope/models/local_model.py
CHANGED
@@ -37,6 +37,10 @@ class LocalModel:
             cache_dir=model_cache_dir,
         )

+        # Fix no padding
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_id,
             revision=model_revision,
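The pad-token fallback above matters because the chat adapter now tokenizes whole batches with padding=True, and several popular checkpoints ship without a pad token configured. A minimal reproduction outside evalscope ('gpt2' is just a convenient tokenizer with no default pad token):

    # Minimal demonstration of the pad_token fallback.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # same fix as in LocalModel above

    batch = tokenizer(['short prompt', 'a noticeably longer prompt'], padding=True, return_tensors='pt')
    print(batch['input_ids'].shape)  # both sequences padded to a common length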
evalscope/models/server_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 import requests
 import time
-from typing import Optional, Union
+from typing import List, Optional, Union

 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.utils.chat_service import ChatMessage
@@ -28,36 +28,35 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.

         Args:
-            inputs (Union[str, dict, list]): The input data.
+            inputs (List[Union[str, dict, list]]): The input data.
             infer_cfg (dict): Inference configuration.

         Returns:
-            res (dict): The model prediction results.
+            res (List[dict]): The model prediction results.
         """
         infer_cfg = infer_cfg or {}
+        results = []

-
-
-
-
-
-
-
-
-
-
-
-
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-            system_prompt = None
+        for input_item in inputs:
+            response = self.process_single_input(input_item, infer_cfg)
+            results.append(response)
+
+        return results
+
+    def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
+        """Process a single input item."""
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
         else:
-
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)

         content = self.make_request_content(query, system_prompt)
         request_json = self.make_request(content, infer_cfg)
@@ -68,7 +67,7 @@ class ServerModelAdapter(BaseModelAdapter):
         """
         Make request content for API.
         """
-        if system_prompt
+        if system_prompt:
             messages = [
                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
@@ -80,19 +79,14 @@ class ServerModelAdapter(BaseModelAdapter):
     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-
-
-
-
-
-
-
-
-            'n': infer_cfg.get('num_return_sequences', 1),
-            'stop': infer_cfg.get('stop', None)
-        }
-        if self.seed is not None:
-            request_json['seed'] = self.seed
+        from evalscope.config import DEFAULT_GENERATION_CONFIG
+        if infer_cfg == DEFAULT_GENERATION_CONFIG:
+            infer_cfg = {
+                'max_tokens': 2048,
+                'temperature': 0.0,
+            }
+
+        request_json = {'model': self.model_id, **content, **infer_cfg}
         logger.debug(f'Request to remote API: {request_json}')
         return request_json

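For reference, the request body assembled by the refactored make_request is a flat OpenAI-style chat payload. The sketch below assumes make_request_content returns a {'messages': [...]} dict, which is what the system/user ChatMessage construction above suggests; the model name is a placeholder.

    # Approximate shape of the request JSON sent to the remote API after the refactor.
    content = {
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'What is the capital of France?'},
        ]
    }
    # When the caller left the generation config at its defaults, the adapter
    # substitutes deterministic settings (see the hunk above).
    infer_cfg = {'max_tokens': 2048, 'temperature': 0.0}

    request_json = {'model': 'my-served-model', **content, **infer_cfg}
    # -> {'model': ..., 'messages': [...], 'max_tokens': 2048, 'temperature': 0.0}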
evalscope/report/app.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+import re
 from dataclasses import dataclass
 from typing import Any, List, Union

@@ -218,7 +219,16 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)


+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
     return string
@@ -226,9 +236,11 @@ def process_string(string: str, max_length: int = 2048) -> str:

 def process_model_prediction(item: Any):
     if isinstance(item, dict):
-
+        res = dict_to_markdown(item)
+        return process_string(res)
     elif isinstance(item, list):
-
+        res = '\n'.join([process_model_prediction(item) for item in item])
+        return process_string(res)
     else:
         return process_string(str(item))

@@ -257,19 +269,20 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
     ds = []
     for i, item in origin_df.iterrows():
         raw_input = item['raw_input']
-
-
-
-
-
-
-
-
-
-
-
-
+        for choice in item['choices']:
+            raw_pred_answer = choice['message']['content']
+            parsed_gold_answer = choice['review']['gold']
+            parsed_pred_answer = choice['review']['pred']
+            score = choice['review']['result']
+            raw_d = {
+                'Input': raw_input,
+                'Generated': raw_pred_answer,
+                'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+                'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+                'Score': score,
+                'NScore': normalize_score(score)
+            }
+            ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
@@ -284,6 +297,8 @@ def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: i
     end = start + rows_per_page
     df_subset = data_review_df.iloc[start:end].copy()
     df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
+    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
+    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
     df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
     styler = style_df(df_subset, columns=['NScore'])
     return df_subset, styler
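The new convert_html_tags helper rewrites angle-bracket tags into square brackets so that model output such as `<think>...</think>` stays visible in the report UI instead of being swallowed as HTML. A quick check of the behaviour:

    import re

    def convert_html_tags(text):
        text = re.sub(r'<(\w+)>', r'[\1]', text)    # opening tags
        text = re.sub(r'</(\w+)>', r'[/\1]', text)  # closing tags
        return text

    print(convert_html_tags('<think>reasoning here</think> final answer: 42'))
    # [think]reasoning here[/think] final answer: 42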
evalscope/run.py
CHANGED
@@ -46,11 +46,13 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:

 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -112,7 +114,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     logger.info(task_cfg)

     for evaluator in evaluators:
-        res_dict = evaluator.eval(
+        res_dict = evaluator.eval()
         eval_results[dataset_name] = res_dict

     return eval_results
@@ -124,21 +126,22 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.evaluator import Evaluator
     from evalscope.models import initialize_model_adapter

+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
+        return EvaluatorCollection(task_cfg, data_adapter, outputs)

-
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)

     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
         outputs=outputs,
evalscope/utils/chat_service.py
CHANGED
@@ -174,7 +174,7 @@ class ChatService:
         )

     def _prepare_text_inputs(self, request: TextCompletionRequest):
-        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return inputs, prompt_tokens

@@ -204,7 +204,7 @@ class ChatService:
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
             request.messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens

evalscope/utils/io_utils.py
CHANGED
@@ -135,7 +135,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
     Dump dict to yaml file.
     """
     with open(yaml_file, 'w') as f:
-        yaml.dump(d, f, default_flow_style=False)
+        yaml.dump(d, f, default_flow_style=False, allow_unicode=True)


 def json_to_dict(json_file) -> dict:
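The allow_unicode=True flag keeps non-ASCII text (for example Chinese dataset names) readable in the dumped YAML instead of being written as \uXXXX escapes:

    import yaml

    d = {'dataset': 'C-Eval 中文评测'}
    print(yaml.dump(d, default_flow_style=False))                      # non-ASCII escaped as "\uXXXX"
    print(yaml.dump(d, default_flow_style=False, allow_unicode=True))  # 中文评测 written as-is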
evalscope/version.py
CHANGED