evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +43 -15
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +0 -6
- evalscope/perf/main.py +65 -15
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +28 -12
- tests/perf/test_perf.py +23 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
CHANGED

@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
-from typing import List
+from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
@@ -190,21 +190,24 @@ class EvaluatorCollection:
         answer_dict = defaultdict(dict)
         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
+            # Create a set of sample indices for which we have answers
             indices = set()
            for answer in answers_list:
                 index = answer.get(AnswerKeys.INDEX)
                 answer_dict[index] = answer
                 indices.add(index)

-
-            for sample in self.dataset
-
-
+            # Filter dataset to only include samples that don't have answers
+            data = [sample for sample in self.dataset if sample.index not in indices]
+
+            # Initialize name map for the filtered dataset
             data_map = self._init_name_map(data)

             logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
             return answer_dict, data, data_map
-
+        else:
+            # If cache isn't enabled or file doesn't exist, return the full dataset
+            return answer_dict, self.dataset, self.dataset_name_map

     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
@@ -214,13 +217,16 @@ class EvaluatorCollection:
         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

         eval_batch_size = self.task_cfg.eval_batch_size
+        # Process samples and get answers
         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
             if self.task_cfg.eval_type == EvalType.SERVICE:
+                # Create a thread pool for parallel processing
                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                     futures = []
                     for sample in dataset:
                         evaluator = self.evaluators[sample.dataset_name]
                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    # Process completed tasks
                     for future in as_completed(futures):
                         answer_list, samples = future.result()
                         answers[samples[0].index] = answer_list[0]
@@ -244,35 +250,79 @@ class EvaluatorCollection:
             pbar.update(len(batch_ids))
         return answers

-    def get_reviews(self, answers):
+    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+        """
+        Retrieve or generate reviews for given answers.
+
+        Args:
+            answers: Dictionary of answers indexed by sample index.
+
+        Returns:
+            Dictionary of reviews indexed by sample index.
+        """
+        # Set up the review file path
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)

-
-
-
-
-
-
-
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f'Error deleting file {file_path}: {e}')
+        review_history_map = defaultdict(dict)
+
+        # Handle caching logic
+        if os.path.exists(review_file_path):
+            if not self.task_cfg.use_cache:
+                # Clear existing reviews if not using cache
+                self._clear_review_files(review_file_path)
             else:
-
+                # Load existing reviews if using cache
+                self._load_existing_reviews(review_file_path, review_history_map)

-        reviews =
+        reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-
-
+            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                # Use cached review if available
+                review_d = review_history_map[file_name][sample.index]
+            else:
+                # Generate new review
+                evaluator = self.evaluators[sample.dataset_name]
+                review_d = evaluator.get_review(answers[sample.index])
+                # Only save the review if it's not in the cache
+                self._save_review(review_file_path, file_name, review_d)
+
             reviews[sample.index] = review_d
-
-                review_d,
-                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
-                dump_mode=DumpMode.APPEND)
+
         return reviews

+    def _clear_review_files(self, review_file_path: str) -> None:
+        """Clear existing review files."""
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                file_path = os.path.join(review_file_path, filename)
+                try:
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                except Exception as e:
+                    logger.error(f'Error deleting file {file_path}: {e}')
+        else:
+            os.remove(review_file_path)
+
+    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+        """Load existing reviews from files."""
+        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                if '.ipynb_checkpoints' in filename:
+                    continue
+                file_path = os.path.join(review_file_path, filename)
+                with open(file_path, 'r') as f:
+                    review_history = [json.loads(line.strip()) for line in f]
+                review_history_map[filename] = {item['index']: item for item in review_history}
+
+    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+        """Save a single review to file."""
+        file_path = os.path.join(review_file_path, file_name)
+        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
     def get_scores(self, reviews) -> float:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
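The reuse logic added to `_filter_answer` keys cached answers by sample index and only re-runs the samples that are missing from the predictions file. A minimal standalone sketch of that pattern (the `Sample` class and the `'index'` field name are stand-ins for `DatasetEntry` and `AnswerKeys.INDEX` from the diff):

import json
from dataclasses import dataclass

@dataclass
class Sample:            # stand-in for evalscope.collections.sampler.DatasetEntry
    index: int
    prompt: str

def filter_uncached(samples, pred_file_path):
    """Split samples into (cached answers, samples still to run), mirroring _filter_answer."""
    cached = {}
    with open(pred_file_path) as f:
        for line in f:
            answer = json.loads(line)
            cached[answer['index']] = answer    # assumes AnswerKeys.INDEX == 'index'
    remaining = [s for s in samples if s.index not in cached]
    return cached, remaining

# Only `remaining` is sent back through the model; `cached` is merged into the final answers.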
evalscope/config.py
CHANGED

@@ -18,24 +18,14 @@ logger = get_logger()

 cur_path = os.path.dirname(os.path.abspath(__file__))

-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-

 @dataclass
 class TaskConfig:
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
-    model_args:
-    model_task:
+    model_args: Dict = field(default_factory=dict)
+    model_task: str = ModelTask.TEXT_GENERATION

     # Template-related arguments
     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -48,7 +38,7 @@ class TaskConfig:
     dataset_hub: str = HubType.MODELSCOPE

     # Generation configuration arguments
-    generation_config:
+    generation_config: Dict = field(default_factory=dict)

     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
@@ -65,6 +55,7 @@ class TaskConfig:
     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.

     # Debug and runtime mode arguments
+    ignore_errors: bool = False
     debug: bool = False
     dry_run: bool = False
     seed: Optional[int] = 42
@@ -95,6 +86,46 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_default_generation_config(self):
+        if self.generation_config:
+            return
+        if self.model_task == ModelTask.IMAGE_GENERATION:
+            self.generation_config = {
+                'height': 1024,
+                'width': 1024,
+                'num_inference_steps': 50,
+                'guidance_scale': 9.0,
+            }
+        elif self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.generation_config = {
+                    'max_length': 2048,
+                    'max_new_tokens': 512,
+                    'do_sample': False,
+                    'top_k': 50,
+                    'top_p': 1.0,
+                    'temperature': 1.0,
+                }
+            elif self.eval_type == EvalType.SERVICE:
+                self.generation_config = {
+                    'max_tokens': 2048,
+                    'temperature': 0.0,
+                }
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
     def to_dict(self):
         result = self.__dict__.copy()
         if isinstance(self.model, CustomModel):
@@ -218,9 +249,9 @@ def parse_task_config(task_cfg) -> TaskConfig:
     elif isinstance(task_cfg, str):
         extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
             task_cfg = TaskConfig.from_json(task_cfg)
         else:
             raise ValueError('Args: Unsupported file extension.')
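The hard-coded DEFAULT_GENERATION_CONFIG module constant is replaced by per-task defaults chosen after construction, and an explicitly passed `generation_config` suppresses them. A standalone mirror of the selection rules for illustration (plain strings stand in for the `ModelTask`/`EvalType` constants; this is a sketch of the behaviour, not the library code):

def default_generation_config(model_task: str, eval_type: str) -> dict:
    """Mirror of TaskConfig.__init_default_generation_config, for illustration only."""
    if model_task == 'image_generation':
        return {'height': 1024, 'width': 1024, 'num_inference_steps': 50, 'guidance_scale': 9.0}
    if model_task == 'text_generation':
        if eval_type == 'checkpoint':
            return {'max_length': 2048, 'max_new_tokens': 512, 'do_sample': False,
                    'top_k': 50, 'top_p': 1.0, 'temperature': 1.0}
        if eval_type == 'service':
            return {'max_tokens': 2048, 'temperature': 0.0}
    return {}

# Service evaluations now default to deterministic decoding:
assert default_generation_config('text_generation', 'service') == {'max_tokens': 2048, 'temperature': 0.0}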
evalscope/evaluator/evaluator.py
CHANGED

@@ -97,13 +97,23 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
         answers_list = []
-
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return answers_list
+            else:
+                raise e
+        # process answer
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -197,16 +207,17 @@ class Evaluator(object):
             reviewer_spec = {}

         review_res = deepcopy(answer_d)
-
-
-            review_res[ReviewKeys.REVIEWED] =
+        if AnswerKeys.CHOICES not in review_res:
+            review_res[AnswerKeys.CHOICES] = []
+            review_res[ReviewKeys.REVIEWED] = True
             review_res[ReviewKeys.REVIEW_ID] = None
             review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
             review_res[ReviewKeys.REVIEW_TIME] = time.time()
+            logger.warning(f'No choices found for answer dict: {review_res}')
             return review_res

         rev_choices = []
-        for choice in
+        for choice in review_res[AnswerKeys.CHOICES]:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)
@@ -280,11 +291,20 @@ class Evaluator(object):
         review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

+        # Load existing reviews if using cache
+        existing_reviews = {}
         if self.use_cache and os.path.exists(review_file_path):
-
-
+            with open(review_file_path, 'r') as f:
+                for line in f:
+                    review = json.loads(line.strip())
+                    existing_reviews[review['index']] = review
+            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')

         def process_single_review(answer_d):
+            # Check if review already exists in cache
+            if self.use_cache and answer_d['index'] in existing_reviews:
+                return existing_reviews[answer_d['index']]
+
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
@@ -299,8 +319,9 @@ class Evaluator(object):
             for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
                 review_d = future.result()
                 reviews_list.append(review_d)
-                # Dump reviews
-
+                # Dump new reviews only if not using cache or review is new
+                if not self.use_cache or review_d['index'] not in existing_reviews:
+                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

         return reviews_list

@@ -315,17 +336,24 @@ class Evaluator(object):
         Returns:
             The metric result. Depends on the metric function in data_adapter.
         """
+        # Get max choices
+        choices_lengths = [
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
+        ]
+        if choices_lengths:
+            max_choices = max(choices_lengths)
+        else:
+            max_choices = 0

+        # Get review result
         review_res_list = []
-        max_choices = max(
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue

             if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
             elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -367,7 +395,7 @@ class Evaluator(object):
         os.makedirs(os.path.dirname(report_path), exist_ok=True)

         # Write report
-        with open(report_path, 'w') as f:
+        with open(report_path, 'w', encoding='utf-8') as f:
             f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')

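Together with the new `ignore_errors` field on TaskConfig, `_get_answer` now drops a failing batch instead of aborting the whole evaluation. The control flow reduces to this standalone pattern (`predict_fn` and the prompts are placeholders, not evalscope APIs):

def get_answers(prompts, predict_fn, ignore_errors=False):
    """Drop-or-raise behaviour added to Evaluator._get_answer, in isolation."""
    answers = []
    for prompt in prompts:
        try:
            answers.append(predict_fn(prompt))
        except Exception as err:
            if ignore_errors:
                # One flaky call costs a single prompt instead of the whole subset.
                print(f'dropping prompt due to {err!r}')
                continue
            raise    # previous behaviour: any failure aborts the run
    return answers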
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED

@@ -44,20 +44,25 @@ from evalscope.utils import get_logger

 logger = get_logger()

-
-
-
-
-
-
-
-
-    os.
-    os.
-
-
-
-
+
+def check_nltk_data():
+    """
+    Check if nltk data is available in the system.
+    If not, download the necessary data files.
+    """
+    try:
+        nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+        os.makedirs(nltk_dir, exist_ok=True)
+        punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+        punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+        if not os.path.exists(punkt_path):
+            os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
+            os.system(f'unzip {punkt_path} -d {nltk_dir}')
+        else:
+            logger.debug(f'{punkt_path} already exists, skipping download')
+    except Exception as e:
+        logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')


 class RougeScorer(scoring.BaseScorer):
@@ -83,7 +88,7 @@ class RougeScorer(scoring.BaseScorer):
     """

     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-
+        check_nltk_data()
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
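`check_nltk_data` shells out to `wget` and `unzip`. Where those binaries are unavailable, the same download-and-extract step can be done with the standard library alone; a sketch using the same URL and target directory as the diff (the function name is hypothetical):

import os
import urllib.request
import zipfile

def fetch_punkt_tab(nltk_dir: str) -> None:
    os.makedirs(nltk_dir, exist_ok=True)
    archive = os.path.join(nltk_dir, 'punkt_tab.zip')
    url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
    if not os.path.exists(archive):
        urllib.request.urlretrieve(url, archive)    # download the tokenizer data
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(nltk_dir)                 # equivalent of `unzip -d nltk_dir`

fetch_punkt_tab(os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers'))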
evalscope/metrics/llm_judge.py
CHANGED

@@ -59,13 +59,13 @@ class LLMJudge:
         # Initialize ServerModelAdapter
         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) ->
+    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
         """
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
         Returns:
-
+            str: The response from the LLM
         """
         input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

@@ -83,7 +83,7 @@ class LLMJudge:
             return llm_response
         except Exception as e:
             logger.error(f'Error during LLM evaluation: {e}')
-            return
+            return ''

     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
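With `__call__` now typed as returning `str` and falling back to an empty string on error, callers can treat "no verdict" uniformly. A hedged usage sketch (the constructor arguments are illustrative; check `LLMJudge.__init__` for the real ones):

from evalscope.metrics.llm_judge import LLMJudge

# Constructor kwargs below are assumptions for illustration only.
judge = LLMJudge(api_url='http://localhost:8000/v1', model_id='judge-model', api_key='EMPTY')
prompt = judge.build_prompt(pred='Paris', gold='Paris', question='What is the capital of France?')
verdict = judge(prompt)    # '' if the request to the judge model failed
if not verdict:
    print('judge unavailable, leaving this sample unscored')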
evalscope/metrics/rouge_metric.py
CHANGED

@@ -19,10 +19,6 @@ class DummyTokenizer:
         return text.split()


-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
-zh_scorer = Rouge()
-
-
 def is_contains_chinese(strs):
     for _char in strs:
         if '\u4e00' <= _char <= '\u9fa5':
@@ -51,6 +47,7 @@ def compute_rouge_score(predict_l, reference_l):

 def compute_rouge_score_one_sample_zh(predict, reference):
     result = dict()
+    zh_scorer = Rouge()
     for p, r in zip(predict, reference):
         p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
         r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
@@ -60,21 +57,22 @@ def compute_rouge_score_one_sample_zh(predict, reference):
         except Exception as e:
             logger.warning(f'rouge score error: {p} {r} {e}')
             continue
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
+        result['Rouge-1-R'] = score['rouge-1']['r']
+        result['Rouge-1-P'] = score['rouge-1']['p']
+        result['Rouge-1-F'] = score['rouge-1']['f']
+        result['Rouge-2-R'] = score['rouge-2']['r']
+        result['Rouge-2-P'] = score['rouge-2']['p']
+        result['Rouge-2-F'] = score['rouge-2']['f']
+        result['Rouge-L-R'] = score['rouge-l']['r']
+        result['Rouge-L-P'] = score['rouge-l']['p']
+        result['Rouge-L-F'] = score['rouge-l']['f']

     return result


 def compute_rouge_score_one_sample(predict, reference):
     result = dict()
+    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
     for p, r in zip(predict, reference):
         try:
             score = scorer.score(p, r)
|
|
|
1
1
|
import os
|
|
2
2
|
import time
|
|
3
3
|
import torch
|
|
4
|
-
from typing import Any, Dict, List, Tuple, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
5
5
|
|
|
6
6
|
from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
|
|
7
7
|
from evalscope.utils.logger import get_logger
|
|
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
58
58
|
return generation_config
|
|
59
59
|
|
|
60
60
|
def _model_generate(self,
|
|
61
|
-
|
|
62
|
-
system_prompts: List[str] = None,
|
|
61
|
+
formatted_prompts: List[str],
|
|
63
62
|
infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
|
|
64
63
|
"""
|
|
65
64
|
Args:
|
|
66
|
-
|
|
67
|
-
system_prompts: The system prompts.
|
|
65
|
+
formatted_prompts: The formatted prompts.
|
|
68
66
|
infer_cfg: The inference configuration.
|
|
69
67
|
Returns:
|
|
70
68
|
The prediction results.
|
|
71
69
|
"""
|
|
72
|
-
if system_prompts is None:
|
|
73
|
-
system_prompts = []
|
|
74
70
|
if infer_cfg is None:
|
|
75
71
|
infer_cfg = {}
|
|
76
72
|
|
|
@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
92
88
|
self.generation_config.update(**infer_cfg)
|
|
93
89
|
fix_do_sample_warning(self.generation_config)
|
|
94
90
|
|
|
95
|
-
# For chat model, use the chat template to format the input
|
|
96
|
-
if self.tokenizer.chat_template is not None:
|
|
97
|
-
formatted_prompts = []
|
|
98
|
-
for i, query in enumerate(queries):
|
|
99
|
-
messages = [ChatMessage(role='user', content=query)]
|
|
100
|
-
if i < len(system_prompts) and system_prompts[i]:
|
|
101
|
-
messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
|
|
102
|
-
# whether thinking is needed
|
|
103
|
-
chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
|
|
104
|
-
if chat_template_kwargs is not None:
|
|
105
|
-
prompts = self.tokenizer.apply_chat_template(
|
|
106
|
-
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
|
|
107
|
-
else:
|
|
108
|
-
prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
109
|
-
formatted_prompts.append(prompts)
|
|
110
|
-
else:
|
|
111
|
-
# For base model, use the queries as the input
|
|
112
|
-
formatted_prompts = queries
|
|
113
|
-
|
|
114
|
-
logger.debug(f'formatted_prompts: {formatted_prompts}')
|
|
115
|
-
|
|
116
91
|
# Get input ids
|
|
117
92
|
inputs = self.tokenizer(
|
|
118
93
|
formatted_prompts, return_tensors='pt', padding=True, truncation=True,
|
|
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
136
111
|
|
|
137
112
|
return responses, input_lengths
|
|
138
113
|
|
|
139
|
-
|
|
140
|
-
def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
|
|
114
|
+
def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
|
|
141
115
|
"""
|
|
116
|
+
Prepare the inputs for the model.
|
|
142
117
|
Args:
|
|
143
118
|
inputs: The input data.
|
|
144
119
|
infer_cfg: The inference configuration.
|
|
145
120
|
Returns:
|
|
146
|
-
The
|
|
121
|
+
The prepared inputs and system prompts.
|
|
147
122
|
"""
|
|
148
|
-
|
|
149
|
-
# Process inputs
|
|
150
123
|
queries = []
|
|
151
124
|
system_prompts = []
|
|
125
|
+
message_list = []
|
|
152
126
|
|
|
153
127
|
for input_item in inputs:
|
|
154
128
|
queries.append(input_item['data'][0])
|
|
155
129
|
system_prompts.append(input_item.get('system_prompt', None))
|
|
130
|
+
if input_item.get('messages', None):
|
|
131
|
+
message_list.append(input_item.get('messages', None))
|
|
132
|
+
|
|
133
|
+
# For non chat model, use the original queries as the input
|
|
134
|
+
if self.tokenizer.chat_template is None:
|
|
135
|
+
return queries
|
|
136
|
+
|
|
137
|
+
# For chat model, use the messages as the input
|
|
138
|
+
# if message_list is None, use the queries as the input
|
|
139
|
+
if len(message_list) == 0:
|
|
140
|
+
for i, query in enumerate(queries):
|
|
141
|
+
messages = [ChatMessage(role='user', content=query)]
|
|
142
|
+
if i < len(system_prompts) and system_prompts[i]:
|
|
143
|
+
messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
|
|
144
|
+
message_list.append(messages)
|
|
145
|
+
|
|
146
|
+
# Format the messages
|
|
147
|
+
formatted_prompts = []
|
|
148
|
+
for messages in message_list:
|
|
149
|
+
# apply chat template
|
|
150
|
+
chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
|
|
151
|
+
if chat_template_kwargs is not None:
|
|
152
|
+
prompts = self.tokenizer.apply_chat_template(
|
|
153
|
+
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
|
|
154
|
+
else:
|
|
155
|
+
prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
156
|
+
formatted_prompts.append(prompts)
|
|
157
|
+
|
|
158
|
+
logger.debug(f'formatted_prompts: {formatted_prompts}')
|
|
159
|
+
return formatted_prompts
|
|
160
|
+
|
|
161
|
+
@torch.no_grad()
|
|
162
|
+
def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
|
|
163
|
+
"""
|
|
164
|
+
Args:
|
|
165
|
+
inputs: The input data.
|
|
166
|
+
infer_cfg: The inference configuration.
|
|
167
|
+
Returns:
|
|
168
|
+
The prediction results.
|
|
169
|
+
"""
|
|
170
|
+
|
|
171
|
+
# Process inputs
|
|
172
|
+
formatted_prompts = self._prepare_inputs(inputs, infer_cfg)
|
|
156
173
|
|
|
157
174
|
# Run inference
|
|
158
|
-
responses, input_lengths = self._model_generate(
|
|
175
|
+
responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)
|
|
159
176
|
|
|
160
177
|
# Process outputs
|
|
161
178
|
results = []
|