evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (46)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  4. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  5. evalscope/benchmarks/data_adapter.py +4 -2
  6. evalscope/benchmarks/drop/__init__.py +0 -0
  7. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  8. evalscope/benchmarks/drop/utils.py +59 -0
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  10. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  11. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  12. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  13. evalscope/benchmarks/tool_bench/utils.py +202 -0
  14. evalscope/benchmarks/utils.py +3 -2
  15. evalscope/benchmarks/winogrande/__init__.py +0 -0
  16. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  17. evalscope/collections/evaluator.py +76 -26
  18. evalscope/config.py +46 -15
  19. evalscope/evaluator/evaluator.py +43 -15
  20. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  21. evalscope/metrics/llm_judge.py +3 -3
  22. evalscope/metrics/rouge_metric.py +11 -13
  23. evalscope/models/adapters/chat_adapter.py +51 -34
  24. evalscope/models/adapters/server_adapter.py +15 -19
  25. evalscope/perf/arguments.py +14 -5
  26. evalscope/perf/benchmark.py +0 -6
  27. evalscope/perf/main.py +65 -15
  28. evalscope/perf/utils/benchmark_util.py +33 -15
  29. evalscope/perf/utils/db_util.py +25 -15
  30. evalscope/perf/utils/log_utils.py +1 -1
  31. evalscope/perf/utils/rich_display.py +186 -0
  32. evalscope/report/app.py +47 -34
  33. evalscope/report/utils.py +1 -1
  34. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  35. evalscope/utils/deprecation_utils.py +42 -0
  36. evalscope/version.py +2 -2
  37. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
  38. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
  39. tests/cli/test_all.py +3 -0
  40. tests/cli/test_collection.py +2 -1
  41. tests/cli/test_run.py +28 -12
  42. tests/perf/test_perf.py +23 -0
  43. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  44. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  45. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  46. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py CHANGED
@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
  from copy import deepcopy
  from tabulate import tabulate
  from tqdm import tqdm
- from typing import List
+ from typing import Any, Dict, List

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.collections.sampler import DatasetEntry
@@ -190,21 +190,24 @@ class EvaluatorCollection:
  answer_dict = defaultdict(dict)
  if self.task_cfg.use_cache and os.path.exists(pred_file_path):
  answers_list = jsonl_to_list(pred_file_path)
+ # Create a set of sample indices for which we have answers
  indices = set()
  for answer in answers_list:
  index = answer.get(AnswerKeys.INDEX)
  answer_dict[index] = answer
  indices.add(index)

- data = []
- for sample in self.dataset:
- if sample.index not in indices:
- data.append(sample)
+ # Filter dataset to only include samples that don't have answers
+ data = [sample for sample in self.dataset if sample.index not in indices]
+
+ # Initialize name map for the filtered dataset
  data_map = self._init_name_map(data)

  logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
  return answer_dict, data, data_map
- return answer_dict, self.dataset, self.dataset_name_map
+ else:
+ # If cache isn't enabled or file doesn't exist, return the full dataset
+ return answer_dict, self.dataset, self.dataset_name_map

  def get_answers(self):
  pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
@@ -214,13 +217,16 @@ class EvaluatorCollection:
  answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

  eval_batch_size = self.task_cfg.eval_batch_size
+ # Process samples and get answers
  with tqdm(total=len(dataset), desc='Getting answers') as pbar:
  if self.task_cfg.eval_type == EvalType.SERVICE:
+ # Create a thread pool for parallel processing
  with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
  futures = []
  for sample in dataset:
  evaluator = self.evaluators[sample.dataset_name]
  futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+ # Process completed tasks
  for future in as_completed(futures):
  answer_list, samples = future.result()
  answers[samples[0].index] = answer_list[0]
@@ -244,35 +250,79 @@ class EvaluatorCollection:
  pbar.update(len(batch_ids))
  return answers

- def get_reviews(self, answers):
+ def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+ """
+ Retrieve or generate reviews for given answers.
+
+ Args:
+ answers: Dictionary of answers indexed by sample index.
+
+ Returns:
+ Dictionary of reviews indexed by sample index.
+ """
+ # Set up the review file path
  review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
  os.makedirs(review_file_path, exist_ok=True)

- if self.task_cfg.use_cache and os.path.exists(review_file_path):
- logger.warning(
- f'Ignore use_cache={self.task_cfg.use_cache}, updating the review file: {review_file_path} ...')
- if os.path.isdir(review_file_path):
- for filename in os.listdir(review_file_path):
- file_path = os.path.join(review_file_path, filename)
- try:
- if os.path.isfile(file_path):
- os.remove(file_path)
- except Exception as e:
- logger.error(f'Error deleting file {file_path}: {e}')
+ review_history_map = defaultdict(dict)
+
+ # Handle caching logic
+ if os.path.exists(review_file_path):
+ if not self.task_cfg.use_cache:
+ # Clear existing reviews if not using cache
+ self._clear_review_files(review_file_path)
  else:
- os.remove(review_file_path)
+ # Load existing reviews if using cache
+ self._load_existing_reviews(review_file_path, review_history_map)

- reviews = defaultdict(dict)
+ reviews = {}
  for sample in tqdm(self.dataset, desc='Getting reviews'):
- evaluator = self.evaluators[sample.dataset_name]
- review_d = evaluator.get_review(answers[sample.index])
+ file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+ if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+ # Use cached review if available
+ review_d = review_history_map[file_name][sample.index]
+ else:
+ # Generate new review
+ evaluator = self.evaluators[sample.dataset_name]
+ review_d = evaluator.get_review(answers[sample.index])
+ # Only save the review if it's not in the cache
+ self._save_review(review_file_path, file_name, review_d)
+
  reviews[sample.index] = review_d
- dump_jsonl_data(
- review_d,
- os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
- dump_mode=DumpMode.APPEND)
+
  return reviews

+ def _clear_review_files(self, review_file_path: str) -> None:
+ """Clear existing review files."""
+ if os.path.isdir(review_file_path):
+ for filename in os.listdir(review_file_path):
+ file_path = os.path.join(review_file_path, filename)
+ try:
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+ except Exception as e:
+ logger.error(f'Error deleting file {file_path}: {e}')
+ else:
+ os.remove(review_file_path)
+
+ def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+ """Load existing reviews from files."""
+ logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+ if os.path.isdir(review_file_path):
+ for filename in os.listdir(review_file_path):
+ if '.ipynb_checkpoints' in filename:
+ continue
+ file_path = os.path.join(review_file_path, filename)
+ with open(file_path, 'r') as f:
+ review_history = [json.loads(line.strip()) for line in f]
+ review_history_map[filename] = {item['index']: item for item in review_history}
+
+ def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+ """Save a single review to file."""
+ file_path = os.path.join(review_file_path, file_name)
+ dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
  def get_scores(self, reviews) -> float:
  scores = defaultdict(dict)
  for sample in tqdm(self.dataset, desc='Getting scores'):
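
The reworked get_reviews above replaces the old delete-and-regenerate behaviour with a per-file review cache. A minimal standalone sketch of that cache layout, assuming reviews are stored as JSONL files whose records carry an 'index' field (as in _load_existing_reviews); the helper name is hypothetical, not evalscope API:

    import json
    import os
    from collections import defaultdict

    def build_review_history(review_dir: str) -> dict:
        """Rebuild {file_name: {sample_index: review}} from cached JSONL review files."""
        history = defaultdict(dict)
        for filename in os.listdir(review_dir):
            if '.ipynb_checkpoints' in filename:
                continue  # skip notebook checkpoint artifacts, as the loader above does
            with open(os.path.join(review_dir, filename), 'r', encoding='utf-8') as f:
                for line in f:
                    record = json.loads(line.strip())
                    history[filename][record['index']] = record
        return history
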
evalscope/config.py CHANGED
@@ -18,24 +18,14 @@ logger = get_logger()

  cur_path = os.path.dirname(os.path.abspath(__file__))

- DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
- DEFAULT_GENERATION_CONFIG = {
- 'max_length': 2048,
- 'max_new_tokens': 512,
- 'do_sample': False,
- 'top_k': 50,
- 'top_p': 1.0,
- 'temperature': 1.0,
- }
-

  @dataclass
  class TaskConfig:
  # Model-related arguments
  model: Union[str, 'CustomModel', None] = None
  model_id: Optional[str] = None
- model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
- model_task: Optional[str] = ModelTask.TEXT_GENERATION
+ model_args: Dict = field(default_factory=dict)
+ model_task: str = ModelTask.TEXT_GENERATION

  # Template-related arguments
  template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -48,7 +38,7 @@ class TaskConfig:
  dataset_hub: str = HubType.MODELSCOPE

  # Generation configuration arguments
- generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+ generation_config: Dict = field(default_factory=dict)

  # Evaluation-related arguments
  eval_type: str = EvalType.CHECKPOINT
@@ -65,6 +55,7 @@ class TaskConfig:
  outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.

  # Debug and runtime mode arguments
+ ignore_errors: bool = False
  debug: bool = False
  dry_run: bool = False
  seed: Optional[int] = 42
@@ -95,6 +86,46 @@ class TaskConfig:
  if self.eval_batch_size is None:
  self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

+ # Set default generation_config and model_args
+ self.__init_default_generation_config()
+ self.__init_default_model_args()
+
+ def __init_default_generation_config(self):
+ if self.generation_config:
+ return
+ if self.model_task == ModelTask.IMAGE_GENERATION:
+ self.generation_config = {
+ 'height': 1024,
+ 'width': 1024,
+ 'num_inference_steps': 50,
+ 'guidance_scale': 9.0,
+ }
+ elif self.model_task == ModelTask.TEXT_GENERATION:
+ if self.eval_type == EvalType.CHECKPOINT:
+ self.generation_config = {
+ 'max_length': 2048,
+ 'max_new_tokens': 512,
+ 'do_sample': False,
+ 'top_k': 50,
+ 'top_p': 1.0,
+ 'temperature': 1.0,
+ }
+ elif self.eval_type == EvalType.SERVICE:
+ self.generation_config = {
+ 'max_tokens': 2048,
+ 'temperature': 0.0,
+ }
+
+ def __init_default_model_args(self):
+ if self.model_args:
+ return
+ if self.model_task == ModelTask.TEXT_GENERATION:
+ if self.eval_type == EvalType.CHECKPOINT:
+ self.model_args = {
+ 'revision': 'master',
+ 'precision': 'torch.float16',
+ }
+
  def to_dict(self):
  result = self.__dict__.copy()
  if isinstance(self.model, CustomModel):
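
With the module-level DEFAULT_* constants gone, defaults are now derived in __post_init__ from model_task and eval_type. A minimal sketch of the resulting behaviour, assuming EvalType is importable from evalscope.constants (as used in config.py); the model id is hypothetical:

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType  # assumed location of EvalType

    cfg = TaskConfig(
        model='qwen2.5-7b-instruct',  # hypothetical model id
        eval_type=EvalType.SERVICE,
        ignore_errors=True,           # new flag: drop failing prompts instead of raising
    )

    # No explicit generation_config was given, so the service defaults from the
    # diff above apply: {'max_tokens': 2048, 'temperature': 0.0}
    print(cfg.generation_config)
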
@@ -218,9 +249,9 @@ def parse_task_config(task_cfg) -> TaskConfig:
  elif isinstance(task_cfg, str):
  extension = os.path.splitext(task_cfg)[-1]
  logger.info(f'Args: Task config is provided with {extension} file type.')
- if extension in ['yaml', 'yml']:
+ if extension in ['.yaml', '.yml']:
  task_cfg = TaskConfig.from_yaml(task_cfg)
- elif extension == 'json':
+ elif extension == '.json':
  task_cfg = TaskConfig.from_json(task_cfg)
  else:
  raise ValueError('Args: Unsupported file extension.')
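
The extension comparison fix matters because os.path.splitext keeps the leading dot, so the old 'yaml'/'json' literals could never match:

    import os

    # splitext returns the suffix with its dot, which the old comparison missed
    print(os.path.splitext('task_config.yaml')[-1])  # '.yaml', not 'yaml'
    print(os.path.splitext('task_config.json')[-1])  # '.json'
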
evalscope/evaluator/evaluator.py CHANGED
@@ -97,13 +97,23 @@ class Evaluator(object):
  answer_d[AnswerKeys.ANSWER_ID] = answer_id
  answer_d[AnswerKeys.SUBSET_NAME] = subset_name
  answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
- # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
  answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
  return answer_d

  def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
  answers_list = []
- answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+ try:
+ # get answer from model
+ answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+ except Exception as e:
+ logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+ # if ignore_errors is True, continue to next input
+ if self.task_cfg.ignore_errors:
+ logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+ return answers_list
+ else:
+ raise e
+ # process answer
  for answer_d, input_prompt in zip(answer_ds, input_prompts):
  answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
  processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
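
The try/except around model_adapter.predict gives _get_answer a simple contract: by default a failure still raises, but with ignore_errors=True the failing batch is dropped and an empty answer list is returned. A hypothetical standalone sketch of that pattern (the names are illustrative, not evalscope API):

    def predict_or_skip(predict, prompts, ignore_errors=False):
        """Return predictions, or an empty list when errors are ignored."""
        try:
            return predict(prompts)
        except Exception as exc:
            if ignore_errors:
                print(f'Dropping {len(prompts)} prompt(s) after error: {exc}')
                return []
            raise
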
@@ -197,16 +207,17 @@ class Evaluator(object):
  reviewer_spec = {}

  review_res = deepcopy(answer_d)
- choices = review_res[AnswerKeys.CHOICES]
- if len(choices) == 0:
- review_res[ReviewKeys.REVIEWED] = False
+ if AnswerKeys.CHOICES not in review_res:
+ review_res[AnswerKeys.CHOICES] = []
+ review_res[ReviewKeys.REVIEWED] = True
  review_res[ReviewKeys.REVIEW_ID] = None
  review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
  review_res[ReviewKeys.REVIEW_TIME] = time.time()
+ logger.warning(f'No choices found for answer dict: {review_res}')
  return review_res

  rev_choices = []
- for choice in choices:
+ for choice in review_res[AnswerKeys.CHOICES]:
  raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
  answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
  gold_content = self.data_adapter.get_gold_answer(raw_input_d)
@@ -280,11 +291,20 @@ class Evaluator(object):
  review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
  os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

+ # Load existing reviews if using cache
+ existing_reviews = {}
  if self.use_cache and os.path.exists(review_file_path):
- logger.info(f'Updating the review file: {review_file_path} ...')
- os.remove(review_file_path)
+ with open(review_file_path, 'r') as f:
+ for line in f:
+ review = json.loads(line.strip())
+ existing_reviews[review['index']] = review
+ logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')

  def process_single_review(answer_d):
+ # Check if review already exists in cache
+ if self.use_cache and answer_d['index'] in existing_reviews:
+ return existing_reviews[answer_d['index']]
+
  review_id, reviewer_spec = self._generate_review_id(answer_d)
  # Get review
  review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
@@ -299,8 +319,9 @@ class Evaluator(object):
  for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
  review_d = future.result()
  reviews_list.append(review_d)
- # Dump reviews
- dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
+ # Dump new reviews only if not using cache or review is new
+ if not self.use_cache or review_d['index'] not in existing_reviews:
+ dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

  return reviews_list

@@ -315,17 +336,24 @@ class Evaluator(object):
  Returns:
  The metric result. Depends on the metric function in data_adapter.
  """
+ # Get max choices
+ choices_lengths = [
+ len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
+ ]
+ if choices_lengths:
+ max_choices = max(choices_lengths)
+ else:
+ max_choices = 0

+ # Get review result
  review_res_list = []
- max_choices = max(
- len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
  for review_d in reviews_list:
  if not review_d[ReviewKeys.REVIEWED]:
- logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+ logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
  continue

  if len(review_d[AnswerKeys.CHOICES]) == 0:
- logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+ logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
  continue
  elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
  review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -367,7 +395,7 @@ class Evaluator(object):
  os.makedirs(os.path.dirname(report_path), exist_ok=True)

  # Write report
- with open(report_path, 'w') as f:
+ with open(report_path, 'w', encoding='utf-8') as f:
  f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
  logger.info(f'Dump report: {report_path} \n')

evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -44,20 +44,25 @@ from evalscope.utils import get_logger

  logger = get_logger()

- # Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
- try:
- nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
- os.makedirs(nltk_dir, exist_ok=True)
- punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
- punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
-
- if not os.path.exists(punkt_path):
- os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
- os.system(f'unzip {punkt_path} -d {nltk_dir}')
- else:
- logger.debug(f'{punkt_path} already exists, skipping download')
- except Exception as e:
- logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
+
+ def check_nltk_data():
+ """
+ Check if nltk data is available in the system.
+ If not, download the necessary data files.
+ """
+ try:
+ nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+ os.makedirs(nltk_dir, exist_ok=True)
+ punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+ punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+ if not os.path.exists(punkt_path):
+ os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
+ os.system(f'unzip {punkt_path} -d {nltk_dir}')
+ else:
+ logger.debug(f'{punkt_path} already exists, skipping download')
+ except Exception as e:
+ logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')


  class RougeScorer(scoring.BaseScorer):
@@ -83,7 +88,7 @@ class RougeScorer(scoring.BaseScorer):
  """

  def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-
+ check_nltk_data()
  self.rouge_types = rouge_types
  if tokenizer:
  self._tokenizer = tokenizer
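
Moving the punkt_tab download out of module scope means importing the bundled scorer no longer has network side effects; the check now runs when a scorer is constructed. A hedged usage sketch, with the module path taken from the file list above and the score() call assumed to follow the upstream rouge_score API this file bundles:

    from evalscope.metrics.bundled_rouge_score.rouge_scorer import RougeScorer

    scorer = RougeScorer(['rouge1', 'rougeL'])  # check_nltk_data() runs here, not at import time
    print(scorer.score('the cat sat on the mat', 'the cat sat'))
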
evalscope/metrics/llm_judge.py CHANGED
@@ -59,13 +59,13 @@ class LLMJudge:
  # Initialize ServerModelAdapter
  self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

- def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> float:
+ def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
  """
  Args:
  prompt (str): The prompt to evaluate
  system_prompt (str, optional): The system prompt to use for the evaluation
  Returns:
- float: The score of the evaluation
+ str: The response from the LLM
  """
  input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

@@ -83,7 +83,7 @@ class LLMJudge:
  return llm_response
  except Exception as e:
  logger.error(f'Error during LLM evaluation: {e}')
- return None
+ return ''

  def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
  if question is None:
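
Since LLMJudge.__call__ now returns the raw LLM response (or an empty string on failure) rather than a float, callers have to parse the score out of the text themselves. A hypothetical parser illustrating that split of responsibilities (not part of evalscope):

    import re

    def extract_score(judge_response: str, default: float = 0.0) -> float:
        """Pull the first number out of a judge response such as 'Rating: 8/10'."""
        match = re.search(r'[-+]?\d+(?:\.\d+)?', judge_response)
        return float(match.group()) if match else default

    print(extract_score('Rating: 8/10'))  # 8.0
    print(extract_score(''))              # 0.0, the empty-string error path
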
evalscope/metrics/rouge_metric.py CHANGED
@@ -19,10 +19,6 @@ class DummyTokenizer:
  return text.split()


- scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
- zh_scorer = Rouge()
-
-
  def is_contains_chinese(strs):
  for _char in strs:
  if '\u4e00' <= _char <= '\u9fa5':
@@ -51,6 +47,7 @@ def compute_rouge_score(predict_l, reference_l):

  def compute_rouge_score_one_sample_zh(predict, reference):
  result = dict()
+ zh_scorer = Rouge()
  for p, r in zip(predict, reference):
  p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
  r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
@@ -60,21 +57,22 @@ def compute_rouge_score_one_sample_zh(predict, reference):
  except Exception as e:
  logger.warning(f'rouge score error: {p} {r} {e}')
  continue
- result['rouge-1-r'] = score['rouge-1']['r']
- result['rouge-1-p'] = score['rouge-1']['p']
- result['rouge-1-f'] = score['rouge-1']['f']
- result['rouge-2-r'] = score['rouge-2']['r']
- result['rouge-2-p'] = score['rouge-2']['p']
- result['rouge-2-f'] = score['rouge-2']['f']
- result['rouge-l-r'] = score['rouge-l']['r']
- result['rouge-l-p'] = score['rouge-l']['p']
- result['rouge-l-f'] = score['rouge-l']['f']
+ result['Rouge-1-R'] = score['rouge-1']['r']
+ result['Rouge-1-P'] = score['rouge-1']['p']
+ result['Rouge-1-F'] = score['rouge-1']['f']
+ result['Rouge-2-R'] = score['rouge-2']['r']
+ result['Rouge-2-P'] = score['rouge-2']['p']
+ result['Rouge-2-F'] = score['rouge-2']['f']
+ result['Rouge-L-R'] = score['rouge-l']['r']
+ result['Rouge-L-P'] = score['rouge-l']['p']
+ result['Rouge-L-F'] = score['rouge-l']['f']

  return result


  def compute_rouge_score_one_sample(predict, reference):
  result = dict()
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
  for p, r in zip(predict, reference):
  try:
  score = scorer.score(p, r)
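
The ROUGE result keys are now capitalized (Rouge-1-R through Rouge-L-F), and both scorers are built lazily inside the functions instead of at import time. A short sketch of the renamed keys, assuming the function is importable from evalscope.metrics.rouge_metric (per the file list) and that rouge/jieba are installed; the sample strings are illustrative:

    from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh

    scores = compute_rouge_score_one_sample_zh(['今天天气很好'], ['今天天气不错'])
    print(sorted(scores))
    # ['Rouge-1-F', 'Rouge-1-P', 'Rouge-1-R', 'Rouge-2-F', ..., 'Rouge-L-R']
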
evalscope/models/adapters/chat_adapter.py CHANGED
@@ -1,7 +1,7 @@
  import os
  import time
  import torch
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
  from evalscope.utils.logger import get_logger
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  return generation_config

  def _model_generate(self,
- queries: List[str],
- system_prompts: List[str] = None,
+ formatted_prompts: List[str],
  infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
  """
  Args:
- queries: The input queries.
- system_prompts: The system prompts.
+ formatted_prompts: The formatted prompts.
  infer_cfg: The inference configuration.
  Returns:
  The prediction results.
  """
- if system_prompts is None:
- system_prompts = []
  if infer_cfg is None:
  infer_cfg = {}

@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  self.generation_config.update(**infer_cfg)
  fix_do_sample_warning(self.generation_config)

- # For chat model, use the chat template to format the input
- if self.tokenizer.chat_template is not None:
- formatted_prompts = []
- for i, query in enumerate(queries):
- messages = [ChatMessage(role='user', content=query)]
- if i < len(system_prompts) and system_prompts[i]:
- messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
- # whether thinking is needed
- chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
- if chat_template_kwargs is not None:
- prompts = self.tokenizer.apply_chat_template(
- messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
- else:
- prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- formatted_prompts.append(prompts)
- else:
- # For base model, use the queries as the input
- formatted_prompts = queries
-
- logger.debug(f'formatted_prompts: {formatted_prompts}')
-
  # Get input ids
  inputs = self.tokenizer(
  formatted_prompts, return_tensors='pt', padding=True, truncation=True,
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

  return responses, input_lengths

- @torch.no_grad()
- def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
+ def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
  """
+ Prepare the inputs for the model.
  Args:
  inputs: The input data.
  infer_cfg: The inference configuration.
  Returns:
- The prediction results.
+ The prepared inputs and system prompts.
  """
-
- # Process inputs
  queries = []
  system_prompts = []
+ message_list = []

  for input_item in inputs:
  queries.append(input_item['data'][0])
  system_prompts.append(input_item.get('system_prompt', None))
+ if input_item.get('messages', None):
+ message_list.append(input_item.get('messages', None))
+
+ # For non chat model, use the original queries as the input
+ if self.tokenizer.chat_template is None:
+ return queries
+
+ # For chat model, use the messages as the input
+ # if message_list is None, use the queries as the input
+ if len(message_list) == 0:
+ for i, query in enumerate(queries):
+ messages = [ChatMessage(role='user', content=query)]
+ if i < len(system_prompts) and system_prompts[i]:
+ messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+ message_list.append(messages)
+
+ # Format the messages
+ formatted_prompts = []
+ for messages in message_list:
+ # apply chat template
+ chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+ if chat_template_kwargs is not None:
+ prompts = self.tokenizer.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+ else:
+ prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ formatted_prompts.append(prompts)
+
+ logger.debug(f'formatted_prompts: {formatted_prompts}')
+ return formatted_prompts
+
+ @torch.no_grad()
+ def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
+ """
+ Args:
+ inputs: The input data.
+ infer_cfg: The inference configuration.
+ Returns:
+ The prediction results.
+ """
+
+ # Process inputs
+ formatted_prompts = self._prepare_inputs(inputs, infer_cfg)

  # Run inference
- responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
+ responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)

  # Process outputs
  results = []
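
The new _prepare_inputs path means an input item can now carry a ready-made messages list in addition to the older data/system_prompt fields. A sketch of the two accepted shapes, with field names taken from the code above and values purely illustrative:

    plain_input = {
        'data': ['What is the capital of France?'],
        'system_prompt': 'You are a helpful assistant.',
    }

    chat_input = {
        'data': ['What is the capital of France?'],  # 'data' is still read unconditionally
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': 'What is the capital of France?'},
        ],
    }

    # Hypothetical call on an instantiated adapter:
    # adapter.predict([plain_input, chat_input], infer_cfg={'max_new_tokens': 128})
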