evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +67 -59
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +12 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/backend/rag_eval/utils/llm.py +1 -1
  11. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  12. evalscope/benchmarks/benchmark.py +1 -0
  13. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  14. evalscope/benchmarks/data_adapter.py +101 -18
  15. evalscope/benchmarks/docmath/__init__.py +0 -0
  16. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope/benchmarks/drop/__init__.py +0 -0
  19. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  20. evalscope/benchmarks/drop/utils.py +59 -0
  21. evalscope/benchmarks/frames/__init__.py +0 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  23. evalscope/benchmarks/frames/utils.py +37 -0
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  25. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  27. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  29. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  30. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  31. evalscope/benchmarks/tool_bench/utils.py +203 -0
  32. evalscope/benchmarks/utils.py +28 -2
  33. evalscope/benchmarks/winogrande/__init__.py +0 -0
  34. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  35. evalscope/cli/start_app.py +2 -2
  36. evalscope/collections/__init__.py +35 -3
  37. evalscope/collections/evaluator.py +94 -32
  38. evalscope/config.py +54 -17
  39. evalscope/evaluator/evaluator.py +80 -41
  40. evalscope/metrics/__init__.py +3 -1
  41. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  42. evalscope/metrics/llm_judge.py +15 -8
  43. evalscope/metrics/math_parser.py +1 -1
  44. evalscope/metrics/rouge_metric.py +11 -13
  45. evalscope/models/adapters/chat_adapter.py +51 -34
  46. evalscope/models/adapters/server_adapter.py +17 -25
  47. evalscope/perf/arguments.py +16 -7
  48. evalscope/perf/benchmark.py +0 -15
  49. evalscope/perf/main.py +72 -15
  50. evalscope/perf/plugin/datasets/custom.py +15 -0
  51. evalscope/perf/utils/benchmark_util.py +34 -16
  52. evalscope/perf/utils/db_util.py +25 -15
  53. evalscope/perf/utils/local_server.py +1 -0
  54. evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope/report/__init__.py +36 -4
  57. evalscope/report/combinator.py +8 -0
  58. evalscope/report/generator.py +33 -9
  59. evalscope/report/utils.py +61 -4
  60. evalscope/run.py +12 -0
  61. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  62. evalscope/utils/deprecation_utils.py +42 -0
  63. evalscope/utils/logger.py +1 -1
  64. evalscope/utils/utils.py +12 -0
  65. evalscope/version.py +2 -2
  66. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
  67. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
  68. tests/aigc/test_t2i.py +40 -3
  69. tests/cli/test_all.py +39 -32
  70. tests/cli/test_collection.py +8 -6
  71. tests/cli/test_run.py +43 -17
  72. tests/perf/test_perf.py +23 -0
  73. tests/rag/test_mteb.py +5 -5
  74. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  75. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  76. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  77. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  78. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -13,29 +13,20 @@ from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-
 
 @dataclass
 class TaskConfig:
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
-    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
-    model_task: Optional[str] = ModelTask.TEXT_GENERATION
+    model_args: Dict = field(default_factory=dict)
+    model_task: str = ModelTask.TEXT_GENERATION
 
     # Template-related arguments
     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -48,14 +39,14 @@ class TaskConfig:
     dataset_hub: str = HubType.MODELSCOPE
 
     # Generation configuration arguments
-    generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+    generation_config: Dict = field(default_factory=dict)
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
     eval_backend: str = EvalBackend.NATIVE
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
-    limit: Optional[int] = None
+    limit: Optional[Union[int, float]] = None
     eval_batch_size: Optional[int] = None
 
     # Cache and working directory arguments
@@ -65,6 +56,7 @@ class TaskConfig:
     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
 
     # Debug and runtime mode arguments
+    ignore_errors: bool = False
    debug: bool = False
     dry_run: bool = False
     seed: Optional[int] = 42
@@ -76,7 +68,8 @@ class TaskConfig:
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
     judge_worker_num: int = 1
-    judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    analysis_report: bool = False
 
     def __post_init__(self):
         if self.model is None:
@@ -95,6 +88,50 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_default_generation_config(self):
+        if self.generation_config:
+            return
+        if self.model_task == ModelTask.IMAGE_GENERATION:
+            self.generation_config = {
+                'height': 1024,
+                'width': 1024,
+                'num_inference_steps': 50,
+                'guidance_scale': 9.0,
+            }
+        elif self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.generation_config = {
+                    'max_length': 2048,
+                    'max_new_tokens': 512,
+                    'do_sample': False,
+                    'top_k': 50,
+                    'top_p': 1.0,
+                    'temperature': 1.0,
+                }
+            elif self.eval_type == EvalType.SERVICE:
+                self.generation_config = {
+                    'max_tokens': 2048,
+                    'temperature': 0.0,
+                }
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
     def to_dict(self):
         result = self.__dict__.copy()
         if isinstance(self.model, CustomModel):
@@ -218,9 +255,9 @@ def parse_task_config(task_cfg) -> TaskConfig:
     elif isinstance(task_cfg, str):
         extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
             task_cfg = TaskConfig.from_json(task_cfg)
         else:
             raise ValueError('Args: Unsupported file extension.')
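Note on the config.py changes above: the module-level DEFAULT_MODEL_ARGS/DEFAULT_GENERATION_CONFIG constants are gone, defaults are now filled in per model_task and eval_type inside __post_init__, and limit may be an absolute count or a fraction. A minimal, illustrative sketch of a task config that exercises the new fields (import paths and values are assumptions; only fields shown in this diff are used, and a real run also needs its dataset arguments):

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, ModelTask

# Hypothetical example values; field names come from the diff above.
task_cfg = TaskConfig(
    model='my-served-model',              # placeholder model name
    model_task=ModelTask.TEXT_GENERATION,
    eval_type=EvalType.SERVICE,           # service eval -> default generation_config
                                          # becomes {'max_tokens': 2048, 'temperature': 0.0}
    limit=0.1,                            # float limit: evaluate roughly 10% of each subset
    ignore_errors=True,                   # drop prompts whose inference call fails instead of aborting
    analysis_report=True,                 # have the judge model write an analysis of the final report
)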
evalscope/evaluator/evaluator.py CHANGED
@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
-from evalscope.report import Report, gen_table
+from evalscope.report import Report, gen_report_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -46,7 +46,6 @@ class Evaluator(object):
         self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
-        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
@@ -79,8 +78,16 @@ class Evaluator(object):
         # Limit and index prompts
         limited_prompts = defaultdict(list)
         for subset_name, prompts_list in prompts.items():
-            limit = self.task_cfg.limit or len(prompts_list)
-            for index, prompt in enumerate(prompts_list[:limit]):
+            # If limit is None, use all prompts
+            if self.task_cfg.limit is None:
+                limit = len(prompts_list)
+            else:
+                if isinstance(self.task_cfg.limit, int):
+                    limit = self.task_cfg.limit
+                elif isinstance(self.task_cfg.limit, float):
+                    limit = int(len(prompts_list) * self.task_cfg.limit)
+            # Limit the number of prompts
+            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
                 prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)
 
@@ -97,13 +104,23 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return answers_list
+            else:
+                raise e
+        # process answer
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -197,16 +214,17 @@ class Evaluator(object):
             reviewer_spec = {}
 
         review_res = deepcopy(answer_d)
-        choices = review_res[AnswerKeys.CHOICES]
-        if len(choices) == 0:
-            review_res[ReviewKeys.REVIEWED] = False
+        if AnswerKeys.CHOICES not in review_res:
+            review_res[AnswerKeys.CHOICES] = []
+            review_res[ReviewKeys.REVIEWED] = True
             review_res[ReviewKeys.REVIEW_ID] = None
             review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
             review_res[ReviewKeys.REVIEW_TIME] = time.time()
+            logger.warning(f'No choices found for answer dict: {review_res}')
             return review_res
 
         rev_choices = []
-        for choice in choices:
+        for choice in review_res[AnswerKeys.CHOICES]:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)
@@ -280,11 +298,20 @@ class Evaluator(object):
         review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
 
+        # Load existing reviews if using cache
+        existing_reviews = {}
         if self.use_cache and os.path.exists(review_file_path):
-            logger.info(f'Updating the review file: {review_file_path} ...')
-            os.remove(review_file_path)
+            with open(review_file_path, 'r') as f:
+                for line in f:
+                    review = json.loads(line.strip())
+                    existing_reviews[review['index']] = review
+            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
 
         def process_single_review(answer_d):
+            # Check if review already exists in cache
+            if self.use_cache and answer_d['index'] in existing_reviews:
+                return existing_reviews[answer_d['index']]
+
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
@@ -299,8 +326,9 @@ class Evaluator(object):
             for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
                 review_d = future.result()
                 reviews_list.append(review_d)
-                # Dump reviews
-                dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
+                # Dump new reviews only if not using cache or review is new
+                if not self.use_cache or review_d['index'] not in existing_reviews:
+                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
@@ -315,17 +343,24 @@ class Evaluator(object):
         Returns:
             The metric result. Depends on the metric function in data_adapter.
         """
+        # Get max choices
+        choices_lengths = [
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
+        ]
+        if choices_lengths:
+            max_choices = max(choices_lengths)
+        else:
+            max_choices = 0
 
+        # Get review result
         review_res_list = []
-        max_choices = max(
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
 
             if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
             elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -343,41 +378,45 @@ class Evaluator(object):
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict]):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
             reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
+        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+        os.makedirs(report_path, exist_ok=True)
         # Get report map
         report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
+            subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Write report
-        with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
+        # Post process report
+        self.data_adapter.post_process_report(report_map, report_path=report_path)
 
         # Make table
-        if use_table:
-            try:
-                report_table: str = gen_table([self.outputs_structure.reports_dir])
-                logger.info(f'Report table: \n{report_table} \n')
-            except Exception:
-                logger.error('Failed to generate report table.')
+        try:
+            report_table = gen_report_table(report_map)
+            logger.info(f'{self.dataset_name_or_path} report table: \n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Dump report
+        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+        report_map.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
@@ -403,7 +442,7 @@ class Evaluator(object):
             stage == 'review': return the reviews_map
         """
 
-        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
+        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
 
         reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}
@@ -433,6 +472,6 @@ class Evaluator(object):
         # Generate report
         report_map = self.dump_report(reviews_score_all)
 
-        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
 
         return report_map
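In isolation, the new limit handling in `get_answers` above reduces to a small helper; restated here as a sketch (the function name is illustrative, not part of the package):

def resolve_limit(limit, n_prompts: int) -> int:
    # None -> all prompts, int -> absolute cap, float -> fraction of the subset
    if limit is None:
        return n_prompts
    if isinstance(limit, float):
        return min(int(n_prompts * limit), n_prompts)
    return min(limit, n_prompts)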
evalscope/metrics/__init__.py CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
     from .named_metrics import Metric, metric_registry
-    from .rouge_metric import compute_rouge_score_one_sample_zh
+    from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh
 
 else:
     _import_structure = {
@@ -28,6 +28,8 @@ else:
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
+            'compute_rouge_score',
+            'compute_rouge_score_one_sample',
         ],
         'llm_judge': [
             'LLMJudge',
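With the lazy-import table extended, the two additional ROUGE helpers are importable straight from evalscope.metrics, for example:

from evalscope.metrics import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh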
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -44,20 +44,25 @@ from evalscope.utils import get_logger
 
 logger = get_logger()
 
-# Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
-try:
-    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
-    os.makedirs(nltk_dir, exist_ok=True)
-    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
-    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
-
-    if not os.path.exists(punkt_path):
-        os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
-        os.system(f'unzip {punkt_path} -d {nltk_dir}')
-    else:
-        logger.debug(f'{punkt_path} already exists, skipping download')
-except Exception as e:
-    logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
+
+def check_nltk_data():
+    """
+    Check if nltk data is available in the system.
+    If not, download the necessary data files.
+    """
+    try:
+        nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+        os.makedirs(nltk_dir, exist_ok=True)
+        punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+        punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+        if not os.path.exists(punkt_path):
+            os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
+            os.system(f'unzip {punkt_path} -d {nltk_dir}')
+        else:
+            logger.debug(f'{punkt_path} already exists, skipping download')
+    except Exception as e:
+        logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
 
 class RougeScorer(scoring.BaseScorer):
@@ -83,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
     """
 
     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
         else:
+            check_nltk_data()
             self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
             logging.info('Using default tokenizer.')
 
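Moving the punkt_tab download into check_nltk_data() makes it lazy: nothing is fetched unless RougeScorer has to build the default tokenizer. A sketch of the two code paths (the import paths are assumptions based on the file layout above):

from evalscope.metrics.bundled_rouge_score import rouge_scorer
from evalscope.metrics.rouge_metric import DummyTokenizer

# Custom tokenizer: check_nltk_data() is never called, so no download happens.
fast_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], tokenizer=DummyTokenizer())

# No tokenizer given: check_nltk_data() runs before the default tokenizer is built.
default_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)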
evalscope/metrics/llm_judge.py CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501
 
+DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+
 
 class LLMJudge:
     """
@@ -47,25 +50,25 @@ class LLMJudge:
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
         """
-        self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
-        self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
+        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
-        self.generation_config = generation_config
+        self.generation_config = generation_config or {}
 
         from evalscope.models import ServerModelAdapter
 
         # Initialize ServerModelAdapter
         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
 
-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> float:
+    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
         """
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
         Returns:
-            float: The score of the evaluation
+            str: The response from the LLM
         """
         input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
 
@@ -74,6 +77,10 @@ class LLMJudge:
         if self.generation_config:
             infer_cfg.update(self.generation_config)
 
+        if self.model_id == DEFAULT_JUDGE_MODEL:
+            # Disable thinking for the default judge model
+            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
         try:
             # Send request using ServerModelAdapter
             response = self.server_adapter.process_single_input(input_data, infer_cfg)
@@ -82,8 +89,8 @@ class LLMJudge:
             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
             return llm_response
         except Exception as e:
-            logger.error(f'Error during LLM evaluation: {e}')
-            return None
+            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
+            return ''
 
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
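The judge now reads its defaults from ModelScope-oriented environment variables. A sketch of configuring it that way (the token value is a placeholder; LLMJudge is exported via the metrics import table shown earlier):

import os
from evalscope.metrics import LLMJudge

os.environ['MODELSCOPE_SDK_TOKEN'] = '<your-token>'                            # api_key fallback
os.environ['MODELSCOPE_API_BASE'] = 'https://api-inference.modelscope.cn/v1/'  # api_url fallback
os.environ['MODELSCOPE_JUDGE_LLM'] = 'Qwen/Qwen3-235B-A22B'                    # model_id fallback

judge = LLMJudge()                 # picks up the environment variables above
verdict = judge(prompt='...')      # __call__ now returns the raw judge response as a str ('' on error)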
evalscope/metrics/math_parser.py CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
 # flake8: noqa
 import re
 import regex
-from latex2sympy2 import latex2sympy
+from latex2sympy2_extended import latex2sympy
 from math import isclose
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
evalscope/metrics/rouge_metric.py CHANGED
@@ -19,10 +19,6 @@ class DummyTokenizer:
         return text.split()
 
 
-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
-zh_scorer = Rouge()
-
-
 def is_contains_chinese(strs):
     for _char in strs:
         if '\u4e00' <= _char <= '\u9fa5':
@@ -51,6 +47,7 @@ def compute_rouge_score(predict_l, reference_l):
 
 def compute_rouge_score_one_sample_zh(predict, reference):
     result = dict()
+    zh_scorer = Rouge()
     for p, r in zip(predict, reference):
         p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
         r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
@@ -60,21 +57,22 @@ def compute_rouge_score_one_sample_zh(predict, reference):
         except Exception as e:
             logger.warning(f'rouge score error: {p} {r} {e}')
             continue
-        result['rouge-1-r'] = score['rouge-1']['r']
-        result['rouge-1-p'] = score['rouge-1']['p']
-        result['rouge-1-f'] = score['rouge-1']['f']
-        result['rouge-2-r'] = score['rouge-2']['r']
-        result['rouge-2-p'] = score['rouge-2']['p']
-        result['rouge-2-f'] = score['rouge-2']['f']
-        result['rouge-l-r'] = score['rouge-l']['r']
-        result['rouge-l-p'] = score['rouge-l']['p']
-        result['rouge-l-f'] = score['rouge-l']['f']
+        result['Rouge-1-R'] = score['rouge-1']['r']
+        result['Rouge-1-P'] = score['rouge-1']['p']
+        result['Rouge-1-F'] = score['rouge-1']['f']
+        result['Rouge-2-R'] = score['rouge-2']['r']
+        result['Rouge-2-P'] = score['rouge-2']['p']
+        result['Rouge-2-F'] = score['rouge-2']['f']
+        result['Rouge-L-R'] = score['rouge-l']['r']
+        result['Rouge-L-P'] = score['rouge-l']['p']
+        result['Rouge-L-F'] = score['rouge-l']['f']
 
     return result
 
 
 def compute_rouge_score_one_sample(predict, reference):
     result = dict()
+    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
     for p, r in zip(predict, reference):
         try:
             score = scorer.score(p, r)
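Both per-sample helpers take parallel lists of predictions and references; the Chinese variant now returns capitalized keys. A small usage sketch (the strings are arbitrary examples):

from evalscope.metrics import compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

scores = compute_rouge_score_one_sample(['the cat sat on the mat'], ['a cat sat on the mat'])
zh_scores = compute_rouge_score_one_sample_zh(['今天天气很好'], ['今天的天气不错'])
print(zh_scores['Rouge-1-F'])      # key names follow the new capitalized form shown above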