evalscope 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (32)
  1. evalscope/backend/opencompass/backend_manager.py +2 -0
  2. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
  3. evalscope/backend/vlm_eval_kit/backend_manager.py +12 -7
  4. evalscope/backend/vlm_eval_kit/custom_dataset.py +47 -0
  5. evalscope/benchmarks/benchmark.py +1 -1
  6. evalscope/config.py +1 -0
  7. evalscope/evaluator/evaluator.py +3 -3
  8. evalscope/models/api/__init__.py +3 -0
  9. evalscope/models/api/openai_api.py +228 -0
  10. evalscope/models/model_adapter.py +6 -0
  11. evalscope/perf/http_client.py +5 -5
  12. evalscope/run_arena.py +5 -3
  13. evalscope/summarizer.py +10 -4
  14. evalscope/third_party/longbench_write/__init__.py +3 -0
  15. evalscope/third_party/longbench_write/eval.py +284 -0
  16. evalscope/third_party/longbench_write/infer.py +217 -0
  17. evalscope/third_party/longbench_write/longbench_write.py +88 -0
  18. evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  19. evalscope/third_party/longbench_write/resources/judge.txt +31 -0
  20. evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
  21. evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
  22. evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
  23. evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  24. evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
  25. evalscope/third_party/longbench_write/utils.py +37 -0
  26. evalscope/version.py +2 -2
  27. evalscope-0.5.4.dist-info/METADATA +399 -0
  28. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/RECORD +31 -16
  29. evalscope-0.5.2.dist-info/METADATA +0 -578
  30. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/WHEEL +0 -0
  31. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/entry_points.txt +0 -0
  32. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/top_level.txt +0 -0
evalscope/third_party/longbench_write/eval.py
@@ -0,0 +1,284 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright (c) ZhipuAI, Inc. and its affiliates.
+ import multiprocessing
+ import os
+ import json
+ import random
+ import re
+ from concurrent.futures import ThreadPoolExecutor
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import requests
+ from tqdm import tqdm
+
+ from evalscope.utils import jsonl_to_list
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ """
+ This script is used to evaluate results of predictions for the LongWriter model.
+ Refer to https://github.com/THUDM/LongWriter for more details.
+
+ EvalLength:
+     Evaluate the length of the generated responses.
+     Metrics:
+         score_l: The average score of the length evaluation.
+
+ EvalQuality:
+     Evaluate the quality of the generated responses by using Judge Model.
+     Metrics:
+         score_q: The average score of the quality evaluation.
+ """
+
+
+ class EvalLength:
+
+     EVAL_L = 'eval_length'
+
+     def __init__(self, model: str, pred_path: str, output_dir: str):
+         self.model = model
+         self.pred_path = pred_path
+         self.output_dir = output_dir
+
+         self.model_id_path = self.model.strip(os.sep).replace(os.sep, '__')
+
+     @staticmethod
+     def score(x, y):
+         if y > x:
+             return 100 * max(0, 1. - (y / x - 1) / 3)
+         else:
+             return 100 * max(0, 1. - (x / y - 1) / 2)
+
+     def eval(self, dump_res: bool = True):
+         # example = {"prompt": "Write an outline for a short 100-word blog post about xxx", "type": "Community Forum", "length": 100, "response_length": 103, "response": "I. Introduction A. xxx"}
+         predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
+         x, y, scores = [], [], []
+
+         for pred in tqdm(predictions, total=len(predictions), desc=f'Process of eval_l: '):
+             x.append(pred["length"])
+             y.append(pred["response_length"])
+             scores.append(self.score(pred["length"], pred["response_length"]))
+
+         avg_score_l = np.mean(scores)
+         logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
+
+         # Dump to output file
+         if dump_res:
+             output_res_path = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_L}.jsonl'
+             with open(output_res_path, 'w') as f:
+                 f.write(json.dumps({'score_l': avg_score_l, 'scores': scores}, ensure_ascii=False) + '\n')
+             logger.info(f'Successfully dumped evaluation results to {output_res_path}')
+
+         return x, y, scores
+
+     def plot(self, x: list, y: list):
+         plt = self.plot_img(x, y)
+         output_pic_path = f'{self.output_dir}/{self.model_id_path}/eval_length_scatter.png'
+         plt.savefig(output_pic_path)
+         logger.info(f'Successfully saved scatter plot to {output_pic_path}')
+
+     @staticmethod
+     def plot_img(x: list, y: list):
+         # set plt size 6x6
+         plt.figure(figsize=(6, 6))
+         lmt = 25000
+         # plot x, y
+         plt.scatter(x, y, s=100, c='r', alpha=0.3)
+         # plot x=y
+         plt.plot([0, lmt], [0, lmt], 'k--')
+         plt.xscale('log')
+         plt.yscale('log')
+         plt.xlim(50, lmt)
+         plt.ylim(50, lmt)
+         plt.xlabel('Required Length', fontsize=20, fontweight='bold')
+         plt.ylabel('Output Length', fontsize=20, fontweight='bold')
+         plt.xticks(fontsize=24)
+         plt.yticks(fontsize=24)
+         plt.tight_layout()
+
+         return plt
+
+
+ class EvalQuality:
+
+     EVAL_Q = 'eval_quality'
+     OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
+     DIMS = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]
+
+     def __init__(self,
+                  model: str,
+                  pred_path: str,
+                  output_dir: str,
+                  prompt_template_path: str,
+                  openai_api_key: str = None,
+                  openai_api_base: str = OPENAI_BASE_URL,
+                  openai_gpt_model: str = 'gpt-4o-2024-05-13',
+                  generation_kwargs: dict = None,
+                  proc_num: int = 8):
+
+         self.model = model
+         self.openai_api_base = openai_api_base
+         self.pred_path = pred_path
+         self.output_dir = output_dir
+         self.proc_num = proc_num
+         self.eval_scores = []
+
+         assert os.path.exists(self.pred_path), f'Prediction file not found: {self.pred_path}'
+
+         # Default: temperature=0.5, max_new_tokens=1024, stop=None
+         if generation_kwargs is None:
+             self.generation_kwargs = dict({
+                 'max_new_tokens': 1024,
+                 'temperature': 0.5,
+                 'stop': None,
+             })
+         else:
+             self.generation_kwargs = generation_kwargs
+
+         self.prompt_template: str = open(prompt_template_path, 'r', encoding='utf-8').read()
+
+         self.model_id_path = self.model.strip(os.sep).replace(os.sep, '__')
+         self.output_res_path = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_Q}.jsonl'
+
+         self.openai_api_key: str = openai_api_key
+         self.openai_gpt_model = openai_gpt_model
+         assert self.openai_api_key, 'Please set `OPENAI_API_KEY` in environment variables.'
+
+     def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
+         tries = 0
+         while tries < 1:
+             tries += 1
+             try:
+                 headers = {
+                     'Authorization': "Bearer {}".format(self.openai_api_key),
+                 }
+                 messages = [
+                     {'role': 'user', 'content': prompt},
+                 ]
+                 resp = requests.post(self.openai_api_base, json={
+                     "model": self.openai_gpt_model,
+                     "messages": messages,
+                     "temperature": temperature,
+                     "max_tokens": max_new_tokens,
+                     "stop": stop,
+                 }, headers=headers, timeout=600)
+                 if resp.status_code != 200:
+                     raise Exception(resp.text)
+                 resp = resp.json()
+                 logger.info(f'>>gpt resp: {resp}')
+                 break
+             except KeyboardInterrupt as e:
+                 raise e
+             except Exception as e:
+                 if "maximum context length" in str(e):
+                     raise e
+                 elif "triggering" in str(e):
+                     return 'Trigger OpenAI\'s content management policy'
+                 logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
+         else:
+             logger.error("Max tries. Failed.")
+             return "Max tries. Failed."
+         try:
+             return resp["choices"][0]["message"]["content"]
+         except:
+             return ''
+
+     @staticmethod
+     def extract_info(pattern, text):
+         match = re.search(pattern, text, re.DOTALL)
+         if match:
+             return match.group(1)
+         else:
+             return None
+
+     def process_data(self, item):
+         # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
+         prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item["response"])
+         scores = None
+         output = self.get_response_gpt4(prompt, **self.generation_kwargs)
+         try:
+             if '```json' in output:
+                 output = self.extract_info(r'```json\n(.*?)\n```', output)
+             output = output.replace('\n', '')
+             scores = json.loads(output)
+             for dim in self.DIMS:
+                 if dim not in scores:
+                     logger.warning(f'Cannot find score for dimension: {dim} in scores {scores}.')
+                     scores = None
+         except Exception as e:
+             logger.error(f'Error occurs during process data: {str(e)}')
+
+         if scores is None:
+             logger.error(f'Failed to extract scores for item: {item}')
+         else:
+             logger.info(f'>>scores: {scores}')
+             item['scores'] = scores
+
+         return item
+
+     def eval(self):
+
+         data_all = jsonl_to_list(self.pred_path)
+         total = len(data_all)
+         assert total > 0, f'No data found in prediction file: {self.pred_path}'
+
+         random.shuffle(data_all)
+
+         with ThreadPoolExecutor() as executor:
+             self.eval_scores = list(executor.map(self.process_data, data_all))
+
+         # self.process_data(items=data)
+         logger.info(f'>>self.eval_scores: {self.eval_scores}')
+
+         total_score = dict()
+         for dim in self.DIMS:
+             # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
+             scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores'] else 3 for item in self.eval_scores]
+             total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
+         total_score['total'] = sum(total_score.values()) / len(total_score)
+         logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
+
+         output_res_path: str = f'{self.output_dir}/{self.model_id_path}/{self.EVAL_Q}.jsonl'
+         with open(output_res_path, 'w', encoding='utf-8') as fout:
+             fout.write(json.dumps(total_score, ensure_ascii=False) + '\n')
+
+
+ def run_eval(model: str,
+              pred_path: str,
+              output_dir: str,
+              prompt_template_path: str,
+              openai_api_key: str,
+              openai_api_base: str,
+              openai_gpt_model: str,
+              generation_kwargs: dict,
+              proc_num: int,
+              stage: list,
+              ):
+     logger.info(f'Got eval stages: {stage}')
+
+     if 'eval_l' in stage:
+         logger.info(f'Processing evaluation of length for model: {model}')
+         eval_length = EvalLength(model=model,
+                                  pred_path=pred_path,
+                                  output_dir=output_dir)
+         x, y, _ = eval_length.eval()
+         eval_length.plot(x, y)
+     else:
+         logger.warning(f'*** Skip `eval_l` stage ***')
+
+     if 'eval_q' in stage:
+         logger.info(f'Processing evaluation of quality for model: {model}')
+         eval_quality = EvalQuality(model=model,
+                                    pred_path=pred_path,
+                                    output_dir=output_dir,
+                                    prompt_template_path=prompt_template_path,
+                                    openai_api_key=openai_api_key,
+                                    openai_api_base=openai_api_base,
+                                    openai_gpt_model=openai_gpt_model,
+                                    generation_kwargs=generation_kwargs,
+                                    proc_num=proc_num)
+         eval_quality.eval()
+     else:
+         logger.warning('*** Skip `eval_q` stage ***')
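
Note on the scoring above: `EvalLength.score` compares the required length x with the produced length y and decays linearly in the length ratio (slope 1/3 when over-length, slope 1/2 when under-length), and `EvalQuality.eval` rescales each 1-5 judge score onto 0-100 via `(mean - 1) * 25`. A standalone illustrative sketch of those two mappings (not part of the package):

# Illustrative only: the two score mappings used in eval.py above.

def length_score(required: int, produced: int) -> float:
    # Mirrors EvalLength.score: linear decay in the length ratio,
    # slope 1/3 when over-length, slope 1/2 when under-length.
    if produced > required:
        return 100 * max(0.0, 1.0 - (produced / required - 1) / 3)
    return 100 * max(0.0, 1.0 - (required / produced - 1) / 2)

print(length_score(100, 103))    # 99.0 -> 3% over-length is barely penalized
print(length_score(1000, 500))   # 50.0 -> half the required length loses half the score
print(length_score(1000, 4000))  # 0.0  -> 4x over-length earns no credit

# Mirrors the rescaling in EvalQuality.eval: a 1-5 judge score maps onto 0-100.
for judge_score in (1, 2, 3, 4, 5):
    print(judge_score, (judge_score - 1) * 25)  # 1 -> 0, 2 -> 25, 3 -> 50, 4 -> 75, 5 -> 100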
evalscope/third_party/longbench_write/infer.py
@@ -0,0 +1,217 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright (c) ZhipuAI, Inc. and its affiliates.
+
+ import os
+ import json
+ from typing import List
+
+ import torch
+ import numpy as np
+ import random
+ from modelscope import AutoTokenizer, AutoModelForCausalLM
+ from tqdm import tqdm
+
+ from evalscope.third_party.longbench_write.utils import count_words
+ from evalscope.models.api import OpenaiApi
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ DEFAULT_PROC_NUM = 8
+
+ """
+ This script is used to generate predictions for the LongWriter model.
+ Refer to https://github.com/THUDM/LongWriter for more details.
+ """
+
+
+ def get_pred(rank, world_size, data, path, max_new_tokens, temperature, tokenizer, fout):
+     device = torch.device(f'cuda:{rank}')
+     model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
+     model = model.eval()
+
+     for dt in tqdm(data, total=len(data), desc=f'Infer on rank-{rank}: '):
+         prompt = dt['prompt']
+         if "llama" in path.lower():
+             prompt = f"[INST]{prompt}[/INST]"
+             input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
+             context_length = input.input_ids.shape[-1]
+             output = model.generate(
+                 **input,
+                 max_new_tokens=max_new_tokens,
+                 num_beams=1,
+                 do_sample=True,
+                 temperature=temperature,
+             )[0]
+             response = tokenizer.decode(output[context_length:], skip_special_tokens=True)
+         else:
+             response, history = model.chat(tokenizer, prompt, history=[], max_new_tokens=max_new_tokens,
+                                            temperature=temperature)
+         dt["response_length"], _ = count_words(response)
+         dt["response"] = response
+
+         logger.info(dt)
+
+         fout.write(json.dumps(dt, ensure_ascii=False) + '\n')
+         fout.flush()
+
+     logger.info(f'Successfully generated predictions for {len(data)} samples.')
+
+
+ def seed_everything(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     np.random.seed(seed)
+     random.seed(seed)
+     torch.backends.cudnn.benchmark = False
+     torch.backends.cudnn.deterministic = True
+     torch.cuda.manual_seed_all(seed)
+
+
+ # def run_infer(model: str,
+ #               data_path: str,
+ #               output_dir: str,
+ #               generation_kwargs: dict = None,
+ #               enable: bool = True, ):
+ #     """
+ #     Process inference for LongWriter model.
+ #
+ #     Args:
+ #         model: The model id of the LongWriter model on ModelScope, or local model path.
+ #         data_path: The path to the data file.
+ #         output_dir: The output directory for the predictions.
+ #         generation_kwargs: The generation arguments for the model.
+ #             Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
+ #         enable: Whether to run infer process.
+ #     """
+ #     model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
+ #
+ #     if not enable:
+ #         logger.warning('*** Skip `infer` stage ***')
+ #         return f'{model_id_path}/pred.jsonl'
+ #
+ #     seed_everything(42)
+ #
+ #     os.makedirs(model_id_path, exist_ok=True)
+ #     fout = open(f'{model_id_path}/pred.jsonl', 'w', encoding='utf-8')
+ #
+ #     if generation_kwargs is None:
+ #         generation_kwargs = dict({
+ #             'max_new_tokens': 32768,
+ #             'temperature': 0.5
+ #         })
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+ #     world_size = torch.cuda.device_count()
+ #
+ #     logger.info(f'>>Input data path: {data_path}')
+ #     with open(data_path, encoding='utf-8') as f:
+ #         data = [json.loads(line) for line in f]
+ #
+ #     data_subsets = [data[i::world_size] for i in range(world_size)]
+ #     processes = []
+ #     for rank in range(world_size):
+ #         p = mp.Process(target=get_pred,
+ #                        args=(rank, world_size, data_subsets[rank], model, generation_kwargs.get('max_new_tokens'), generation_kwargs.get('temperature'), tokenizer, fout))
+ #         p.start()
+ #         processes.append(p)
+ #
+ #     for p in processes:
+ #         p.join()
+ #
+ #     logger.info(f'Finish generating predictions for {model}.')
+ #     logger.info(f'Predictions are saved in {model_id_path}/pred.jsonl.')
+ #
+ #     return f'{model_id_path}/pred.jsonl'
+
+
+ def run_infer(model: str,
+               data_path: str,
+               output_dir: str,
+               api_config: dict,
+               generation_kwargs: dict = None,
+               enable: bool = True, ):
+     """
+     Process inference for LongWriter model.
+
+     Args:
+         model: The model id of the LongWriter model on ModelScope, or local model path.
+         data_path: The path to the data file.
+         output_dir: The output directory for the predictions.
+         api_config: The configuration for the OpenAI API inference.
+             Attributes:
+                 `openai_api_key`: The OpenAI API key. Default is None for custom model serving.
+                 `openai_api_base`: The OpenAI API base URL.
+                 `is_chat`: Whether to chat. Default is True.
+                 `verbose`: Whether to print verbose information. Default is False.
+         generation_kwargs: The generation arguments for the model.
+             Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
+         enable: Whether to run infer process.
+     """
+     model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
+
+     if not enable:
+         logger.warning('*** Skip `infer` stage ***')
+         return f'{model_id_path}/pred.jsonl'
+
+     seed_everything(42)
+
+     if generation_kwargs is None:
+         generation_kwargs = dict({
+             'max_new_tokens': 32768,
+             'temperature': 0.5,
+             'repetition_penalty': 1.0,
+         })
+
+     # Prepare inputs
+     logger.info(f'>>Input data path: {data_path}')
+     # TODO: add load data from ms
+     with open(data_path, encoding='utf-8') as f:
+         data_list = [json.loads(line) for line in f]
+
+     logger.info(f'Input example: {data_list[0]}')
+
+     api_client = OpenaiApi(model=model,
+                            openai_api_key=None,
+                            openai_api_base=api_config.get('openai_api_base', 'http://127.0.0.1:8000/v1/chat/completions'),
+                            max_new_tokens=generation_kwargs.get('max_new_tokens', 4096),
+                            temperature=generation_kwargs.get('temperature', 0.0),
+                            repetition_penalty=generation_kwargs.get('repetition_penalty', 1.0),
+                            is_chat=api_config.get('is_chat', True),
+                            verbose=api_config.get('verbose', False),
+                            )
+
+     # TODO: ONLY FOR TEST generate_simple
+     results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list])
+     assert len(results) == len(data_list), f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
+     logger.info(f'Finish generating predictions with {len(data_list)} samples for {model}')
+
+     # Outputs
+     os.makedirs(model_id_path, exist_ok=True)
+     output_pred_file: str = f'{model_id_path}/pred.jsonl'
+     with open(output_pred_file, 'w', encoding='utf-8') as f:
+         for dt, res in zip(data_list, results):
+             dt["response_length"], _ = count_words(res)
+             dt["response"] = res
+             f.write(json.dumps(dt, ensure_ascii=False) + '\n')
+
+     logger.info(f'Predictions are saved in {output_pred_file}')
+
+     return output_pred_file
+
+
+ if __name__ == '__main__':
+     # ZhipuAI/LongWriter-glm4-9b, ZhipuAI/LongWriter-llama3.1-8b
+     api_config = dict(openai_api_key=None,
+                       openai_api_base='http://127.0.0.1:8000/v1/chat/completions',
+                       is_chat=True,
+                       verbose=True,)
+
+     run_infer(model='ZhipuAI/LongWriter-glm4-9b',
+               data_path='resources/longbench_write.jsonl',
+               output_dir='outputs',
+               api_config=api_config,
+               generation_kwargs=dict({
+                   'max_new_tokens': 32768,
+                   'temperature': 0.5})
+               )
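
Note on the data contract: `run_infer` above copies each benchmark item and adds `response` plus `response_length` (computed with `count_words`) to pred.jsonl, which `EvalLength` and `EvalQuality` in eval.py then read back. A hypothetical record in that shape, with field values taken from the example comment in eval.py (illustrative only, not part of the package):

# Illustrative only: one line of the pred.jsonl written by run_infer()
# and read back by EvalLength / EvalQuality.
import json

record = {
    "prompt": "Write an outline for a short 100-word blog post about xxx",
    "type": "Community Forum",   # benchmark category carried through unchanged
    "length": 100,               # required length, used as x in EvalLength.score
    "response_length": 103,      # word count of the model output (count_words)
    "response": "I. Introduction A. xxx",
}

with open("pred.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")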
evalscope/third_party/longbench_write/longbench_write.py
@@ -0,0 +1,88 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ from typing import Union
+
+ from evalscope.third_party.longbench_write.infer import run_infer
+ from evalscope.third_party.longbench_write.eval import run_eval
+ from evalscope.utils import yaml_to_dict, json_to_dict
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ """
+ Entry file for LongWriter evaluation.
+ """
+
+
+ def run_task(task_cfg: Union[str, dict]):
+
+     if isinstance(task_cfg, str):
+         if task_cfg.endswith('.yaml'):
+             task_cfg: dict = yaml_to_dict(task_cfg)
+         elif task_cfg.endswith('.json'):
+             task_cfg: dict = json_to_dict(task_cfg)
+         else:
+             raise ValueError(f'Unsupported file format: {task_cfg}, should be yaml or json file.')
+
+     # Parse task configuration
+     stage: list = task_cfg.get('stage', ['infer', 'eval_l', 'eval_q'])
+     model: str = task_cfg.get('model')
+     input_data_path: str = task_cfg.get('input_data_path')
+     output_dir: str = task_cfg.get('output_dir')
+
+     infer_config: dict = task_cfg.get('infer_config')
+     eval_config: dict = task_cfg.get('eval_config')
+     assert infer_config is not None and eval_config is not None, 'Please provide infer_config and eval_config.'
+
+     # Run inference process
+     pred_res_path = run_infer(model=model,
+                               data_path=input_data_path or os.path.join(os.path.dirname(__file__), 'resources/longbench_write.jsonl'),
+                               output_dir=output_dir,
+                               api_config=dict(
+                                   openai_api_key=infer_config.get('openai_api_key'),
+                                   openai_api_base=infer_config.get('openai_api_base'),
+                                   is_chat=infer_config.get('is_chat', True),
+                                   verbose=infer_config.get('verbose', False),
+                               ),
+                               generation_kwargs=infer_config.get('generation_kwargs'),
+                               enable='infer' in stage)
+
+     # Run eval process
+     run_eval(model=model,
+              pred_path=pred_res_path,
+              output_dir=output_dir,
+              prompt_template_path=os.path.join(os.path.dirname(__file__), 'resources/judge.txt'),
+              openai_api_key=eval_config.get('openai_api_key'),
+              openai_api_base=eval_config.get('openai_api_base'),
+              openai_gpt_model=eval_config.get('openai_gpt_model'),
+              generation_kwargs=eval_config.get('generation_kwargs'),
+              proc_num=eval_config.get('proc_num', 16),
+              stage=stage)
+
+
+ if __name__ == '__main__':
+     # Note: evaluation task configuration can also be loaded from yaml or json file.
+     # task_cfg = os.path.join(os.path.dirname(__file__), 'default_task.yaml')
+     # task_cfg = os.path.join(os.path.dirname(__file__), 'default_task.json')
+     task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
+                     model='ZhipuAI/LongWriter-glm4-9b',    # or /path/to/your_model_dir
+                     input_data_path=None,
+                     output_dir='./outputs',
+
+                     infer_config={
+                         'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                         'is_chat': True,
+                         'verbose': False,
+                         'generation_kwargs': {'max_new_tokens': 32768, 'temperature': 0.5, 'repetition_penalty': 1.0},
+                     },
+
+                     eval_config={
+                         'openai_api_key': 'YOUR_OPENAI_API_KEY',
+                         'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                         'openai_gpt_model': 'gpt-4o-2024-05-13',
+                         'generation_kwargs': {'max_new_tokens': 1024, 'temperature': 0.5, 'stop': None},
+                         'proc_num': 16,
+                     },
+                     )
+
+     run_task(task_cfg=task_cfg)
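
Note: besides an inline dict, `run_task` also accepts a path to a .yaml or .json task file (handled via `yaml_to_dict` / `json_to_dict`). A hedged sketch reusing the keys from the `__main__` example above; the output file name here is arbitrary and only for illustration:

# Sketch: save the task configuration to a JSON file and pass the path to run_task().
import json

from evalscope.third_party.longbench_write.longbench_write import run_task

task_cfg = {
    'stage': ['infer', 'eval_l', 'eval_q'],
    'model': 'ZhipuAI/LongWriter-glm4-9b',   # or /path/to/your_model_dir
    'input_data_path': None,                 # falls back to resources/longbench_write.jsonl
    'output_dir': './outputs',
    'infer_config': {
        'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
        'is_chat': True,
        'verbose': False,
        'generation_kwargs': {'max_new_tokens': 32768, 'temperature': 0.5, 'repetition_penalty': 1.0},
    },
    'eval_config': {
        'openai_api_key': 'YOUR_OPENAI_API_KEY',
        'openai_api_base': 'https://api.openai.com/v1/chat/completions',
        'openai_gpt_model': 'gpt-4o-2024-05-13',
        'generation_kwargs': {'max_new_tokens': 1024, 'temperature': 0.5, 'stop': None},
        'proc_num': 16,
    },
}

with open('my_longbench_write_task.json', 'w', encoding='utf-8') as f:
    json.dump(task_cfg, f, ensure_ascii=False, indent=2)

run_task(task_cfg='my_longbench_write_task.json')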
evalscope/third_party/longbench_write/resources/__init__.py
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/third_party/longbench_write/resources/judge.txt
@@ -0,0 +1,31 @@
+ You are an expert in evaluating text quality. Please evaluate the quality of an AI assistant's response to a user's writing request. Be as strict as possible.
+
+ You need to evaluate across the following six dimensions, with scores ranging from 1 to 5. The scoring criteria from 5 to 1 for each dimension are as follows:
+
+ 1. Relevance: From content highly relevant and fully applicable to the user's request to completely irrelevant or inapplicable.
+
+ 2. Accuracy: From content completely accurate with no factual errors or misleading information to content with numerous errors and highly misleading.
+
+ 3. Coherence: From clear structure with smooth logical connections to disorganized structure with no coherence.
+
+ 4. Clarity: From clear language, rich in detail, and easy to understand to confusing expression with minimal details.
+
+ 5. Breadth and Depth: From both broad and deep content with a lot of information to seriously lacking breadth and depth with minimal information.
+
+ 6. Reading Experience: From excellent reading experience, engaging and easy to understand content to very poor reading experience, boring and hard to understand content.
+
+ Please evaluate the quality of the following response to a user's request according to the above requirements.
+
+ <User Request>
+
+ $INST$
+
+ </User Request>
+
+ <Response>
+
+ $RESPONSE$
+
+ </Response>
+
+ Please evaluate the quality of the response. You must first provide a brief analysis of its quality, then give a comprehensive analysis with scores for each dimension. The output must strictly follow the JSON format: {"Analysis": ..., "Relevance": ..., "Accuracy": ..., "Coherence": ..., "Clarity": ..., "Breadth and Depth": ..., "Reading Experience": ...}. You do not need to consider whether the response meets the user's length requirements in your evaluation. Ensure that only one integer between 1 and 5 is output for each dimension score.
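
Note: the strict JSON format demanded at the end of judge.txt is what `EvalQuality.process_data` parses, and `EvalQuality.eval` then maps each 1-5 dimension score onto 0-100. An illustrative sketch for a single, invented judge reply (eval.py applies the same rescaling to per-dimension averages over all items, not to a single reply):

# Illustrative only: mapping one judge reply in the judge.txt JSON format
# onto the 0-100 quality scale used in EvalQuality.eval(). The reply is invented.
import json

judge_reply = json.loads(
    '{"Analysis": "Concise and on-topic.", "Relevance": 5, "Accuracy": 4, '
    '"Coherence": 4, "Clarity": 5, "Breadth and Depth": 3, "Reading Experience": 4}'
)

DIMS = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]

per_dim = {dim: (float(judge_reply[dim]) - 1) * 25 for dim in DIMS}   # 1-5 -> 0-100
total = sum(per_dim.values()) / len(per_dim)
print(per_dim)          # {'Relevance': 100.0, 'Accuracy': 75.0, ...}
print(round(total, 2))  # 79.17 for this reply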