evalscope 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/backend/opencompass/backend_manager.py +2 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +12 -7
- evalscope/backend/vlm_eval_kit/custom_dataset.py +47 -0
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +3 -3
- evalscope/models/api/__init__.py +3 -0
- evalscope/models/api/openai_api.py +228 -0
- evalscope/models/model_adapter.py +6 -0
- evalscope/perf/http_client.py +5 -5
- evalscope/run_arena.py +5 -3
- evalscope/summarizer.py +10 -4
- evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope/version.py +2 -2
- evalscope-0.5.4.dist-info/METADATA +399 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/RECORD +31 -16
- evalscope-0.5.2.dist-info/METADATA +0 -578
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/WHEEL +0 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/top_level.txt +0 -0
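At a glance, 0.5.4 adds a new third-party task, longbench_write (a LongWriter-style long-form writing benchmark with infer, eval_l and eval_q stages), plus a small OpenAI-compatible API client in evalscope/models/api/openai_api.py. The sketch below is inferred from how that client is used in infer.py later in this diff; openai_api.py itself is not shown in full here, so treat the constructor arguments as indicative rather than definitive.

from evalscope.models.api import OpenaiApi

# Any OpenAI-compatible chat-completions endpoint works: a locally served
# LongWriter model for the `infer` stage, or api.openai.com for the judge model.
client = OpenaiApi(model='ZhipuAI/LongWriter-glm4-9b',
                   openai_api_key=None,
                   openai_api_base='http://127.0.0.1:8000/v1/chat/completions',
                   max_new_tokens=32768,
                   temperature=0.5,
                   is_chat=True)
responses = client.generate_simple(inputs=['Write a 1000-word blog post about open-source LLMs.'])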
evalscope/third_party/longbench_write/eval.py
@@ -0,0 +1,284 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) ZhipuAI, Inc. and its affiliates.
+import multiprocessing
+import os
+import json
+import random
+import re
+from concurrent.futures import ThreadPoolExecutor
+
+import matplotlib.pyplot as plt
+import numpy as np
+import requests
+from tqdm import tqdm
+
+from evalscope.utils import jsonl_to_list
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+"""
+This script is used to evaluate results of predictions for the LongWriter model.
+Refer to https://github.com/THUDM/LongWriter for more details.
+
+EvalLength:
+    Evaluate the length of the generated responses.
+    Metrics:
+        score_l: The average score of the length evaluation.
+
+EvalQuality:
+    Evaluate the quality of the generated responses by using Judge Model.
+    Metrics:
+        score_q: The average score of the quality evaluation.
+"""
+
+
+class EvalLength:
+
+    EVAL_L = 'eval_length'
+
+    def __init__(self, model: str, pred_path: str, output_dir: str):
+        self.model = model
+        self.pred_path = pred_path
+        self.output_dir = output_dir
+
+        self.model_id_path = self.model.strip(os.sep).replace(os.sep, '__')
+
+    @staticmethod
+    def score(x, y):
+        if y > x:
+            return 100 * max(0, 1. - (y / x - 1) / 3)
+        else:
+            return 100 * max(0, 1. - (x / y - 1) / 2)
+
+    def eval(self, dump_res: bool = True):
+        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx", "type": "Community Forum", "length": 100, "response_length": 103, "response": "I. Introduction A. xxx"}
+        predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
+        x, y, scores = [], [], []
+
+        for pred in tqdm(predictions, total=len(predictions), desc=f'Process of eval_l: '):
+            x.append(pred["length"])
+            y.append(pred["response_length"])
+            scores.append(self.score(pred["length"], pred["response_length"]))
+
+        avg_score_l = np.mean(scores)
+        logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
+
+        # Dump to output file
+        if dump_res:
+            output_res_path = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_L}.jsonl'
+            with open(output_res_path, 'w') as f:
+                f.write(json.dumps({'score_l': avg_score_l, 'scores': scores}, ensure_ascii=False) + '\n')
+            logger.info(f'Successfully dumped evaluation results to {output_res_path}')
+
+        return x, y, scores
+
+    def plot(self, x: list, y: list):
+        plt = self.plot_img(x, y)
+        output_pic_path = f'{self.output_dir}/{self.model_id_path}/eval_length_scatter.png'
+        plt.savefig(output_pic_path)
+        logger.info(f'Successfully saved scatter plot to {output_pic_path}')
+
+    @staticmethod
+    def plot_img(x: list, y: list):
+        # set plt size 6x6
+        plt.figure(figsize=(6, 6))
+        lmt = 25000
+        # plot x, y
+        plt.scatter(x, y, s=100, c='r', alpha=0.3)
+        # plot x=y
+        plt.plot([0, lmt], [0, lmt], 'k--')
+        plt.xscale('log')
+        plt.yscale('log')
+        plt.xlim(50, lmt)
+        plt.ylim(50, lmt)
+        plt.xlabel('Required Length', fontsize=20, fontweight='bold')
+        plt.ylabel('Output Length', fontsize=20, fontweight='bold')
+        plt.xticks(fontsize=24)
+        plt.yticks(fontsize=24)
+        plt.tight_layout()
+
+        return plt
+
+
+class EvalQuality:
+
+    EVAL_Q = 'eval_quality'
+    OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
+    DIMS = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]
+
+    def __init__(self,
+                 model: str,
+                 pred_path: str,
+                 output_dir: str,
+                 prompt_template_path: str,
+                 openai_api_key: str = None,
+                 openai_api_base: str = OPENAI_BASE_URL,
+                 openai_gpt_model: str = 'gpt-4o-2024-05-13',
+                 generation_kwargs: dict = None,
+                 proc_num: int = 8):
+
+        self.model = model
+        self.openai_api_base = openai_api_base
+        self.pred_path = pred_path
+        self.output_dir = output_dir
+        self.proc_num = proc_num
+        self.eval_scores = []
+
+        assert os.path.exists(self.pred_path), f'Prediction file not found: {self.pred_path}'
+
+        # Default: temperature=0.5, max_new_tokens=1024, stop=None
+        if generation_kwargs is None:
+            self.generation_kwargs = dict({
+                'max_new_tokens': 1024,
+                'temperature': 0.5,
+                'stop': None,
+            })
+        else:
+            self.generation_kwargs = generation_kwargs
+
+        self.prompt_template: str = open(prompt_template_path, 'r', encoding='utf-8').read()
+
+        self.model_id_path = self.model.strip(os.sep).replace(os.sep, '__')
+        self.output_res_path = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_Q}.jsonl'
+
+        self.openai_api_key: str = openai_api_key
+        self.openai_gpt_model = openai_gpt_model
+        assert self.openai_api_key, 'Please set `OPENAI_API_KEY` in environment variables.'
+
+    def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
+        tries = 0
+        while tries < 1:
+            tries += 1
+            try:
+                headers = {
+                    'Authorization': "Bearer {}".format(self.openai_api_key),
+                }
+                messages = [
+                    {'role': 'user', 'content': prompt},
+                ]
+                resp = requests.post(self.openai_api_base, json={
+                    "model": self.openai_gpt_model,
+                    "messages": messages,
+                    "temperature": temperature,
+                    "max_tokens": max_new_tokens,
+                    "stop": stop,
+                }, headers=headers, timeout=600)
+                if resp.status_code != 200:
+                    raise Exception(resp.text)
+                resp = resp.json()
+                logger.info(f'>>gpt resp: {resp}')
+                break
+            except KeyboardInterrupt as e:
+                raise e
+            except Exception as e:
+                if "maximum context length" in str(e):
+                    raise e
+                elif "triggering" in str(e):
+                    return 'Trigger OpenAI\'s content management policy'
+                logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
+        else:
+            logger.error("Max tries. Failed.")
+            return "Max tries. Failed."
+        try:
+            return resp["choices"][0]["message"]["content"]
+        except:
+            return ''
+
+    @staticmethod
+    def extract_info(pattern, text):
+        match = re.search(pattern, text, re.DOTALL)
+        if match:
+            return match.group(1)
+        else:
+            return None
+
+    def process_data(self, item):
+        # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
+        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item["response"])
+        scores = None
+        output = self.get_response_gpt4(prompt, **self.generation_kwargs)
+        try:
+            if '```json' in output:
+                output = self.extract_info(r'```json\n(.*?)\n```', output)
+            output = output.replace('\n', '')
+            scores = json.loads(output)
+            for dim in self.DIMS:
+                if dim not in scores:
+                    logger.warning(f'Cannot find score for dimension: {dim} in scores {scores}.')
+                    scores = None
+        except Exception as e:
+            logger.error(f'Error occurs during process data: {str(e)}')
+
+        if scores is None:
+            logger.error(f'Failed to extract scores for item: {item}')
+        else:
+            logger.info(f'>>scores: {scores}')
+            item['scores'] = scores
+
+        return item
+
+    def eval(self):
+
+        data_all = jsonl_to_list(self.pred_path)
+        total = len(data_all)
+        assert total > 0, f'No data found in prediction file: {self.pred_path}'
+
+        random.shuffle(data_all)
+
+        with ThreadPoolExecutor() as executor:
+            self.eval_scores = list(executor.map(self.process_data, data_all))
+
+        # self.process_data(items=data)
+        logger.info(f'>>self.eval_scores: {self.eval_scores}')
+
+        total_score = dict()
+        for dim in self.DIMS:
+            # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
+            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores'] else 3 for item in self.eval_scores]
+            total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
+        total_score['total'] = sum(total_score.values()) / len(total_score)
+        logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
+
+        output_res_path: str = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_Q}.jsonl'
+        with open(output_res_path, 'w', encoding='utf-8') as fout:
+            fout.write(json.dumps(total_score, ensure_ascii=False) + '\n')
+
+
+def run_eval(model: str,
+             pred_path: str,
+             output_dir: str,
+             prompt_template_path: str,
+             openai_api_key: str,
+             openai_api_base: str,
+             openai_gpt_model: str,
+             generation_kwargs: dict,
+             proc_num: int,
+             stage: list,
+             ):
+    logger.info(f'Got eval stages: {stage}')
+
+    if 'eval_l' in stage:
+        logger.info(f'Processing evaluation of length for model: {model}')
+        eval_length = EvalLength(model=model,
+                                 pred_path=pred_path,
+                                 output_dir=output_dir)
+        x, y, _ = eval_length.eval()
+        eval_length.plot(x, y)
+    else:
+        logger.warning(f'*** Skip `eval_l` stage ***')
+
+    if 'eval_q' in stage:
+        logger.info(f'Processing evaluation of quality for model: {model}')
+        eval_quality = EvalQuality(model=model,
+                                   pred_path=pred_path,
+                                   output_dir=output_dir,
+                                   prompt_template_path=prompt_template_path,
+                                   openai_api_key=openai_api_key,
+                                   openai_api_base=openai_api_base,
+                                   openai_gpt_model=openai_gpt_model,
+                                   generation_kwargs=generation_kwargs,
+                                   proc_num=proc_num)
+        eval_quality.eval()
+    else:
+        logger.warning('*** Skip `eval_q` stage ***')
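The length metric above is a piecewise ratio penalty: over-length responses are penalized more gently (divisor 3) than under-length ones (divisor 2), and anything at least 4x the required length, or at most one third of it, scores 0. A small worked example with illustrative numbers, assuming the module is importable as evalscope.third_party.longbench_write.eval:

from evalscope.third_party.longbench_write.eval import EvalLength

# Required length x=1000, generated y=1300 (too long):
#   100 * (1 - (1300/1000 - 1) / 3) = 100 * (1 - 0.1) = 90.0
print(EvalLength.score(1000, 1300))  # 90.0

# Required length x=1000, generated y=700 (too short):
#   100 * (1 - (1000/700 - 1) / 2) ≈ 100 * (1 - 0.214) ≈ 78.6
print(EvalLength.score(1000, 700))   # ~78.57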
evalscope/third_party/longbench_write/infer.py
@@ -0,0 +1,217 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) ZhipuAI, Inc. and its affiliates.
+
+import os
+import json
+from typing import List
+
+import torch
+import numpy as np
+import random
+from modelscope import AutoTokenizer, AutoModelForCausalLM
+from tqdm import tqdm
+
+from evalscope.third_party.longbench_write.utils import count_words
+from evalscope.models.api import OpenaiApi
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+DEFAULT_PROC_NUM = 8
+
+"""
+This script is used to generate predictions for the LongWriter model.
+Refer to https://github.com/THUDM/LongWriter for more details.
+"""
+
+
+def get_pred(rank, world_size, data, path, max_new_tokens, temperature, tokenizer, fout):
+    device = torch.device(f'cuda:{rank}')
+    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
+    model = model.eval()
+
+    for dt in tqdm(data, total=len(data), desc=f'Infer on rank-{rank}: '):
+        prompt = dt['prompt']
+        if "llama" in path.lower():
+            prompt = f"[INST]{prompt}[/INST]"
+            input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
+            context_length = input.input_ids.shape[-1]
+            output = model.generate(
+                **input,
+                max_new_tokens=max_new_tokens,
+                num_beams=1,
+                do_sample=True,
+                temperature=temperature,
+            )[0]
+            response = tokenizer.decode(output[context_length:], skip_special_tokens=True)
+        else:
+            response, history = model.chat(tokenizer, prompt, history=[], max_new_tokens=max_new_tokens,
+                                           temperature=temperature)
+        dt["response_length"], _ = count_words(response)
+        dt["response"] = response
+
+        logger.info(dt)
+
+        fout.write(json.dumps(dt, ensure_ascii=False) + '\n')
+        fout.flush()
+
+    logger.info(f'Successfully generated predictions for {len(data)} samples.')
+
+
+def seed_everything(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+    torch.cuda.manual_seed_all(seed)
+
+
+# def run_infer(model: str,
+#               data_path: str,
+#               output_dir: str,
+#               generation_kwargs: dict = None,
+#               enable: bool = True, ):
+#     """
+#     Process inference for LongWriter model.
+#
+#     Args:
+#         model: The model id of the LongWriter model on ModelScope, or local model path.
+#         data_path: The path to the data file.
+#         output_dir: The output directory for the predictions.
+#         generation_kwargs: The generation arguments for the model.
+#             Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
+#         enable: Whether to run infer process.
+#     """
+#     model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
+#
+#     if not enable:
+#         logger.warning('*** Skip `infer` stage ***')
+#         return f'{model_id_path}/pred.jsonl'
+#
+#     seed_everything(42)
+#
+#     os.makedirs(model_id_path, exist_ok=True)
+#     fout = open(f'{model_id_path}/pred.jsonl', 'w', encoding='utf-8')
+#
+#     if generation_kwargs is None:
+#         generation_kwargs = dict({
+#             'max_new_tokens': 32768,
+#             'temperature': 0.5
+#         })
+#
+#     tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+#     world_size = torch.cuda.device_count()
+#
+#     logger.info(f'>>Input data path: {data_path}')
+#     with open(data_path, encoding='utf-8') as f:
+#         data = [json.loads(line) for line in f]
+#
+#     data_subsets = [data[i::world_size] for i in range(world_size)]
+#     processes = []
+#     for rank in range(world_size):
+#         p = mp.Process(target=get_pred,
+#                        args=(rank, world_size, data_subsets[rank], model, generation_kwargs.get('max_new_tokens'), generation_kwargs.get('temperature'), tokenizer, fout))
+#         p.start()
+#         processes.append(p)
+#
+#     for p in processes:
+#         p.join()
+#
+#     logger.info(f'Finish generating predictions for {model}.')
+#     logger.info(f'Predictions are saved in {model_id_path}/pred.jsonl.')
+#
+#     return f'{model_id_path}/pred.jsonl'
+
+
+def run_infer(model: str,
+              data_path: str,
+              output_dir: str,
+              api_config: dict,
+              generation_kwargs: dict = None,
+              enable: bool = True, ):
+    """
+    Process inference for LongWriter model.
+
+    Args:
+        model: The model id of the LongWriter model on ModelScope, or local model path.
+        data_path: The path to the data file.
+        output_dir: The output directory for the predictions.
+        api_config: The configuration for the OpenAI API inference.
+            Attributes:
+                `openai_api_key`: The OpenAI API key. Default is None for custom model serving.
+                `openai_api_base`: The OpenAI API base URL.
+                `is_chat`: Whether to chat. Default is True.
+                `verbose`: Whether to print verbose information. Default is False.
+        generation_kwargs: The generation arguments for the model.
+            Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
+        enable: Whether to run infer process.
+    """
+    model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
+
+    if not enable:
+        logger.warning('*** Skip `infer` stage ***')
+        return f'{model_id_path}/pred.jsonl'
+
+    seed_everything(42)
+
+    if generation_kwargs is None:
+        generation_kwargs = dict({
+            'max_new_tokens': 32768,
+            'temperature': 0.5,
+            'repetition_penalty': 1.0,
+        })
+
+    # Prepare inputs
+    logger.info(f'>>Input data path: {data_path}')
+    # TODO: add load data from ms
+    with open(data_path, encoding='utf-8') as f:
+        data_list = [json.loads(line) for line in f]
+
+    logger.info(f'Input example: {data_list[0]}')
+
+    api_client = OpenaiApi(model=model,
+                           openai_api_key=None,
+                           openai_api_base=api_config.get('openai_api_base', 'http://127.0.0.1:8000/v1/chat/completions'),
+                           max_new_tokens=generation_kwargs.get('max_new_tokens', 4096),
+                           temperature=generation_kwargs.get('temperature', 0.0),
+                           repetition_penalty=generation_kwargs.get('repetition_penalty', 1.0),
+                           is_chat=api_config.get('is_chat', True),
+                           verbose=api_config.get('verbose', False),
+                           )
+
+    # TODO: ONLY FOR TEST generate_simple
+    results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list])
+    assert len(results) == len(data_list), f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
+    logger.info(f'Finish generating predictions with {len(data_list)} samples for {model}')
+
+    # Outputs
+    os.makedirs(model_id_path, exist_ok=True)
+    output_pred_file: str = f'{model_id_path}/pred.jsonl'
+    with open(output_pred_file, 'w', encoding='utf-8') as f:
+        for dt, res in zip(data_list, results):
+            dt["response_length"], _ = count_words(res)
+            dt["response"] = res
+            f.write(json.dumps(dt, ensure_ascii=False) + '\n')
+
+    logger.info(f'Predictions are saved in {output_pred_file}')
+
+    return output_pred_file
+
+
+if __name__ == '__main__':
+    # ZhipuAI/LongWriter-glm4-9b, ZhipuAI/LongWriter-llama3.1-8b
+    api_config = dict(openai_api_key=None,
+                      openai_api_base='http://127.0.0.1:8000/v1/chat/completions',
+                      is_chat=True,
+                      verbose=True,)
+
+    run_infer(model='ZhipuAI/LongWriter-glm4-9b',
+              data_path='resources/longbench_write.jsonl',
+              output_dir='outputs',
+              api_config=api_config,
+              generation_kwargs=dict({
+                  'max_new_tokens': 32768,
+                  'temperature': 0.5})
+              )
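For context, each line of the input longbench_write.jsonl carries at least a prompt and a required length; run_infer adds response and response_length (a word count from count_words), which is exactly what EvalLength and EvalQuality read back in. A prediction record therefore looks roughly like the following (field values are illustrative, echoing the example comment in eval.py):

import json

record = {
    "prompt": "Write an outline for a short 100-word blog post about xxx",
    "type": "Community Forum",   # category label carried through from the input data
    "length": 100,               # required length, consumed by EvalLength
    "response": "I. Introduction A. xxx",
    "response_length": 103,      # word count computed by count_words()
}
print(json.dumps(record, ensure_ascii=False))  # one line of pred.jsonl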
evalscope/third_party/longbench_write/longbench_write.py
@@ -0,0 +1,88 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Union
+
+from evalscope.third_party.longbench_write.infer import run_infer
+from evalscope.third_party.longbench_write.eval import run_eval
+from evalscope.utils import yaml_to_dict, json_to_dict
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+"""
+Entry file for LongWriter evaluation.
+"""
+
+
+def run_task(task_cfg: Union[str, dict]):
+
+    if isinstance(task_cfg, str):
+        if task_cfg.endswith('.yaml'):
+            task_cfg: dict = yaml_to_dict(task_cfg)
+        elif task_cfg.endswith('.json'):
+            task_cfg: dict = json_to_dict(task_cfg)
+        else:
+            raise ValueError(f'Unsupported file format: {task_cfg}, should be yaml or json file.')
+
+    # Parse task configuration
+    stage: list = task_cfg.get('stage', ['infer', 'eval_l', 'eval_q'])
+    model: str = task_cfg.get('model')
+    input_data_path: str = task_cfg.get('input_data_path')
+    output_dir: str = task_cfg.get('output_dir')
+
+    infer_config: dict = task_cfg.get('infer_config')
+    eval_config: dict = task_cfg.get('eval_config')
+    assert infer_config is not None and eval_config is not None, 'Please provide infer_config and eval_config.'
+
+    # Run inference process
+    pred_res_path = run_infer(model=model,
+                              data_path=input_data_path or os.path.join(os.path.dirname(__file__), 'resources/longbench_write.jsonl'),
+                              output_dir=output_dir,
+                              api_config=dict(
+                                  openai_api_key=infer_config.get('openai_api_key'),
+                                  openai_api_base=infer_config.get('openai_api_base'),
+                                  is_chat=infer_config.get('is_chat', True),
+                                  verbose=infer_config.get('verbose', False),
+                              ),
+                              generation_kwargs=infer_config.get('generation_kwargs'),
+                              enable='infer' in stage)
+
+    # Run eval process
+    run_eval(model=model,
+             pred_path=pred_res_path,
+             output_dir=output_dir,
+             prompt_template_path=os.path.join(os.path.dirname(__file__), 'resources/judge.txt'),
+             openai_api_key=eval_config.get('openai_api_key'),
+             openai_api_base=eval_config.get('openai_api_base'),
+             openai_gpt_model=eval_config.get('openai_gpt_model'),
+             generation_kwargs=eval_config.get('generation_kwargs'),
+             proc_num=eval_config.get('proc_num', 16),
+             stage=stage)
+
+
+if __name__ == '__main__':
+    # Note: evaluation task configuration can also be loaded from yaml or json file.
+    # task_cfg = os.path.join(os.path.dirname(__file__), 'default_task.yaml')
+    # task_cfg = os.path.join(os.path.dirname(__file__), 'default_task.json')
+    task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
+                    model='ZhipuAI/LongWriter-glm4-9b',  # or /path/to/your_model_dir
+                    input_data_path=None,
+                    output_dir='./outputs',
+
+                    infer_config={
+                        'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                        'is_chat': True,
+                        'verbose': False,
+                        'generation_kwargs': {'max_new_tokens': 32768, 'temperature': 0.5, 'repetition_penalty': 1.0},
+                    },
+
+                    eval_config={
+                        'openai_api_key': 'YOUR_OPENAI_API_KEY',
+                        'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                        'openai_gpt_model': 'gpt-4o-2024-05-13',
+                        'generation_kwargs': {'max_new_tokens': 1024, 'temperature': 0.5, 'stop': None},
+                        'proc_num': 16,
+                    },
+                    )
+
+    run_task(task_cfg=task_cfg)
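run_task is the single entry point that chains the infer and eval stages; besides the inline dict in the __main__ block, it also accepts a path to a YAML or JSON task config, parsed via yaml_to_dict / json_to_dict. A minimal sketch, assuming a config file with the same keys as the dict above (the default_task.yaml / default_task.json names come from the commented-out lines and may not ship with the package):

from evalscope.third_party.longbench_write.longbench_write import run_task

# Equivalent to passing the dict literal shown above:
run_task(task_cfg='default_task.yaml')  # or run_task(task_cfg='default_task.json')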
evalscope/third_party/longbench_write/resources/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/third_party/longbench_write/resources/judge.txt
@@ -0,0 +1,31 @@
+You are an expert in evaluating text quality. Please evaluate the quality of an AI assistant's response to a user's writing request. Be as strict as possible.
+
+You need to evaluate across the following six dimensions, with scores ranging from 1 to 5. The scoring criteria from 5 to 1 for each dimension are as follows:
+
+1. Relevance: From content highly relevant and fully applicable to the user's request to completely irrelevant or inapplicable.
+
+2. Accuracy: From content completely accurate with no factual errors or misleading information to content with numerous errors and highly misleading.
+
+3. Coherence: From clear structure with smooth logical connections to disorganized structure with no coherence.
+
+4. Clarity: From clear language, rich in detail, and easy to understand to confusing expression with minimal details.
+
+5. Breadth and Depth: From both broad and deep content with a lot of information to seriously lacking breadth and depth with minimal information.
+
+6. Reading Experience: From excellent reading experience, engaging and easy to understand content to very poor reading experience, boring and hard to understand content.
+
+Please evaluate the quality of the following response to a user's request according to the above requirements.
+
+<User Request>
+
+$INST$
+
+</User Request>
+
+<Response>
+
+$RESPONSE$
+
+</Response>
+
+Please evaluate the quality of the response. You must first provide a brief analysis of its quality, then give a comprehensive analysis with scores for each dimension. The output must strictly follow the JSON format: {"Analysis": ..., "Relevance": ..., "Accuracy": ..., "Coherence": ..., "Clarity": ..., "Breadth and Depth": ..., "Reading Experience": ...}. You do not need to consider whether the response meets the user's length requirements in your evaluation. Ensure that only one integer between 1 and 5 is output for each dimension score.
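EvalQuality asks the judge model for exactly this JSON shape, then maps each 1-5 dimension score onto a 0-100 scale via (score - 1) * 25 and averages the six dimensions into a total (missing dimensions default to 3). A worked single-sample example with made-up judge output:

import json

# Made-up judge reply in the format requested by the prompt above:
judge_reply = ('{"Analysis": "Well structured but a little shallow.", "Relevance": 5, '
               '"Accuracy": 4, "Coherence": 4, "Clarity": 4, "Breadth and Depth": 3, '
               '"Reading Experience": 4}')
scores = json.loads(judge_reply)

dims = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]
per_dim = {d: (scores[d] - 1) * 25 for d in dims}  # 1-5 -> 0-100, as in EvalQuality.eval
total = sum(per_dim.values()) / len(per_dim)       # (100 + 75 + 75 + 75 + 50 + 75) / 6 = 75.0
print(per_dim, total)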