evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/cli/start_app.py CHANGED
@@ -21,13 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
-        from evalscope.report import add_argument
+        from evalscope.app import add_argument
 
         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.report.app import create_app
+        from evalscope.app import create_app
 
         create_app(self.args)
evalscope/collections/__init__.py CHANGED
@@ -1,3 +1,35 @@
-from evalscope.collections.evaluator import EvaluatorCollection
-from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
-from evalscope.collections.schema import CollectionSchema, DatasetInfo
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .evaluator import EvaluatorCollection
+    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'evaluator': [
+            'EvaluatorCollection',
+        ],
+        'sampler': [
+            'StratifiedSampler',
+            'UniformSampler',
+            'WeightedSampler',
+        ],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
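This refactor swaps the eager imports for a lazy module, so importing the package no longer pulls in every submodule up front. A minimal sketch of the caller-visible behaviour, assuming an installed evalscope and only the names listed in the diff above:

```python
# Importing the package alone no longer loads evaluator/sampler/schema;
# each submodule is imported only when one of its names is first resolved.
import evalscope.collections as collections  # cheap: no submodule imports yet

schema_cls = collections.CollectionSchema    # attribute access loads .schema here
sampler_cls = collections.WeightedSampler    # attribute access loads .sampler here
print(schema_cls, sampler_cls)
```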
evalscope/collections/evaluator.py CHANGED
@@ -32,11 +32,22 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self, samples, infer_cfg) -> List[dict]:
+    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
         input_prompts = [sample.prompt for sample in samples]
         subset_name = samples[0].subset_name
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return [None] * len(samples), samples
+            else:
+                raise e
+        # process answers
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -66,13 +77,17 @@ class EvaluatorCollection:
         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
-    def load(self) -> tuple[list[DatasetEntry], str]:
+    def load(self) -> tuple[List[DatasetEntry], str]:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset
-        if self.task_cfg.limit:
-            raw_dataset = random.sample(raw_dataset,
-                                        self.task_cfg.limit) if len(raw_dataset) > self.task_cfg.limit else raw_dataset
+        limit = len(raw_dataset)
+        if self.task_cfg.limit is not None:
+            if isinstance(self.task_cfg.limit, int):
+                limit = self.task_cfg.limit
+            elif isinstance(self.task_cfg.limit, float):
+                limit = int(len(raw_dataset) * self.task_cfg.limit)
+        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
         # index dataset
         datasets = []
         for sample in raw_dataset:
@@ -82,7 +97,7 @@ class EvaluatorCollection:
         return datasets, dataset_name
 
     @staticmethod
-    def _init_name_map(dataset):
+    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
         dataset_name_map = defaultdict(lambda: defaultdict(list))
         for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name
@@ -90,13 +105,13 @@ class EvaluatorCollection:
         return dataset_name_map
 
     @staticmethod
-    def _init_id_map(dataset):
+    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
         dataset_id_map = {}
         for sample in dataset:
             dataset_id_map[sample.index] = sample
         return dataset_id_map
 
-    def _initialize_evaluators(self):
+    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
         evaluators = {}
         # load dataset args
         dataset_args = deepcopy(self.task_cfg.dataset_args)
@@ -114,6 +129,8 @@ class EvaluatorCollection:
         return evaluators
 
     def get_report(self, scores):
+        if not scores:
+            return
 
         def get_dataframe(scores):
             data = []
@@ -179,11 +196,19 @@ class EvaluatorCollection:
             logger.info(f'{level} Report:\n{table}')
 
         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
         # save report to JSON file
         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
-        with open(report_file_path, 'w', encoding='utf-8') as f:
-            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        report.to_json(report_file_path)
+
+        logger.info(f'Report saved to {report_file_path}')
         return report
 
     def _filter_answer(self, pred_file_path):
@@ -229,9 +254,12 @@ class EvaluatorCollection:
                 # Process completed tasks
                 for future in as_completed(futures):
                     answer_list, samples = future.result()
-                    answers[samples[0].index] = answer_list[0]
-                    dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(1)
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         else:
             for dataset_name, data_map in dataset_name_map.items():
                 # get evaluator for the dataset
@@ -241,13 +269,14 @@ class EvaluatorCollection:
                     # get batch samples
                     batch_ids = ids[i:i + eval_batch_size]
                     batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                    answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                    answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                     # update answers
-                    for j, _id in enumerate(batch_ids):
-                        answers[_id] = answer_list[j]
-                    dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
-
-                    pbar.update(len(batch_ids))
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         return answers
 
     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
@@ -277,19 +306,22 @@ class EvaluatorCollection:
 
         reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
-
-            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
-                # Use cached review if available
-                review_d = review_history_map[file_name][sample.index]
-            else:
-                # Generate new review
-                evaluator = self.evaluators[sample.dataset_name]
-                review_d = evaluator.get_review(answers[sample.index])
-                # Only save the review if it's not in the cache
-                self._save_review(review_file_path, file_name, review_d)
-
-            reviews[sample.index] = review_d
+            try:
+                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                    # Use cached review if available
+                    review_d = review_history_map[file_name][sample.index]
+                else:
+                    # Generate new review
+                    evaluator = self.evaluators[sample.dataset_name]
+                    review_d = evaluator.get_review(answers[sample.index])
+                    # Only save the review if it's not in the cache
+                    self._save_review(review_file_path, file_name, review_d)
+
+                reviews[sample.index] = review_d
+            except Exception as e:
+                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')
 
         return reviews
 
@@ -327,6 +359,8 @@ class EvaluatorCollection:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
             evaluator = self.evaluators[sample.dataset_name]
+            if sample.index not in reviews:
+                continue
             review_d = reviews[sample.index]
             score = evaluator.get_score(review_d)
             scores[sample.index] = score
evalscope/config.py CHANGED
@@ -13,6 +13,7 @@ from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
@@ -45,7 +46,7 @@ class TaskConfig:
     eval_backend: str = EvalBackend.NATIVE
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
-    limit: Optional[int] = None
+    limit: Optional[Union[int, float]] = None
     eval_batch_size: Optional[int] = None
 
     # Cache and working directory arguments
@@ -67,7 +68,8 @@ class TaskConfig:
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
     judge_worker_num: int = 1
-    judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    analysis_report: bool = False
 
    def __post_init__(self):
        if self.model is None:
@@ -86,6 +88,10 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
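With this change `TaskConfig.limit` accepts either an absolute count (int) or a fraction of the dataset (float), and the new `analysis_report` flag opts in to the LLM-written report analysis. A hedged configuration sketch; the model and dataset names are placeholders, and fields other than `limit` and `analysis_report` follow evalscope's existing TaskConfig:

```python
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model id
    datasets=['gsm8k'],           # placeholder dataset
    limit=0.1,                    # float: evaluate a 10% sample; an int would mean an absolute count
    analysis_report=True,         # added in this release range: generate an LLM analysis of the report
)
run_task(task_cfg)
```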
evalscope/constants.py CHANGED
@@ -146,7 +146,7 @@ class EvalType:
 
 
 class OutputType:
-    LOGITS = 'logits' # for multiple choice tasks
+    LOGITS = 'logits' # for logits output tasks
     GENERATION = 'generation' # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits' # for multiple choice tasks
     CONTINUOUS = 'continuous_logits' # for continuous tasks
evalscope/evaluator/evaluator.py CHANGED
@@ -46,7 +46,6 @@ class Evaluator(object):
         self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
-        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
@@ -79,8 +78,16 @@ class Evaluator(object):
         # Limit and index prompts
         limited_prompts = defaultdict(list)
         for subset_name, prompts_list in prompts.items():
-            limit = self.task_cfg.limit or len(prompts_list)
-            for index, prompt in enumerate(prompts_list[:limit]):
+            # If limit is None, use all prompts
+            if self.task_cfg.limit is None:
+                limit = len(prompts_list)
+            else:
+                if isinstance(self.task_cfg.limit, int):
+                    limit = self.task_cfg.limit
+                elif isinstance(self.task_cfg.limit, float):
+                    limit = int(len(prompts_list) * self.task_cfg.limit)
+            # Limit the number of prompts
+            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
                 prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)
 
@@ -101,7 +108,6 @@ class Evaluator(object):
             return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        answers_list = []
         try:
             # get answer from model
             answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
@@ -110,10 +116,11 @@ class Evaluator(object):
             # if ignore_errors is True, continue to next input
             if self.task_cfg.ignore_errors:
                 logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return answers_list
+                return []
             else:
                 raise e
         # process answer
+        answers_list = []
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -371,41 +378,46 @@ class Evaluator(object):
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict]):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
             reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
+        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+        os.makedirs(report_path, exist_ok=True)
         # Get report map
         report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
+            subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Write report
-        with open(report_path, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
+        # Post process report
+        self.data_adapter.post_process_report(report_map, report_path=report_path)
 
         # Make table
-        if use_table:
-            try:
-                report_table: str = gen_table([self.outputs_structure.reports_dir])
-                logger.info(f'Report table: \n{report_table} \n')
-            except Exception:
-                logger.error('Failed to generate report table.')
+        try:
+            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
+            logger.info(f'\n{self.dataset_name_or_path} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Dump report
+        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+        report_map.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
@@ -431,7 +443,7 @@ class Evaluator(object):
         stage == 'review': return the reviews_map
         """
 
-        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
+        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
 
         reviews_score_all = {} # {subset_name: (score, num)}
         stage_answers_dict = {}
@@ -461,6 +473,6 @@ class Evaluator(object):
         # Generate report
         report_map = self.dump_report(reviews_score_all)
 
-        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
 
         return report_map
evalscope/metrics/__init__.py CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
     from .named_metrics import Metric, metric_registry
-    from .rouge_metric import compute_rouge_score_one_sample_zh
+    from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh
 
 else:
     _import_structure = {
@@ -28,6 +28,8 @@ else:
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
+            'compute_rouge_score',
+            'compute_rouge_score_one_sample',
         ],
         'llm_judge': [
             'LLMJudge',
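The package now re-exports `compute_rouge_score` and `compute_rouge_score_one_sample` alongside the existing `compute_rouge_score_one_sample_zh`. The diff does not show their signatures, so only the imports are sketched:

```python
# Both names resolve through the lazy _import_structure shown above.
from evalscope.metrics import compute_rouge_score, compute_rouge_score_one_sample
```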
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -88,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
     """
 
     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-        check_nltk_data()
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
         else:
+            check_nltk_data()
             self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
             logging.info('Using default tokenizer.')
 
evalscope/metrics/llm_judge.py CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """ # noqa: E501
 
+DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+
 
 class LLMJudge:
     """
@@ -47,12 +50,12 @@ class LLMJudge:
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
         """
-        self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
-        self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
+        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
-        self.generation_config = generation_config
+        self.generation_config = generation_config or {}
 
         from evalscope.models import ServerModelAdapter
 
@@ -74,6 +77,10 @@ class LLMJudge:
         if self.generation_config:
             infer_cfg.update(self.generation_config)
 
+        if self.model_id == DEFAULT_JUDGE_MODEL:
+            # Disable thinking for the default judge model
+            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
         try:
             # Send request using ServerModelAdapter
             response = self.server_adapter.process_single_input(input_data, infer_cfg)
@@ -82,7 +89,7 @@ class LLMJudge:
             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
             return llm_response
         except Exception as e:
-            logger.error(f'Error during LLM evaluation: {e}')
+            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
             return ''
 
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
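The judge now defaults to the ModelScope inference API instead of OpenAI. A hedged sketch of configuring it through the new environment variables; the token value is a placeholder, and explicit constructor arguments still take precedence per the `api_key or ...` fallbacks above:

```python
import os

os.environ['MODELSCOPE_SDK_TOKEN'] = '<your-modelscope-token>'  # placeholder
os.environ['MODELSCOPE_API_BASE'] = 'https://api-inference.modelscope.cn/v1/'
os.environ['MODELSCOPE_JUDGE_LLM'] = 'Qwen/Qwen3-235B-A22B'

from evalscope.metrics import LLMJudge

judge = LLMJudge()  # picks up the env vars / defaults shown in the diff above
```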
evalscope/metrics/math_parser.py CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
 # flake8: noqa
 import re
 import regex
-from latex2sympy2 import latex2sympy
+from latex2sympy2_extended import latex2sympy
 from math import isclose
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
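Only the parser dependency changes here: `latex2sympy` is now imported from `latex2sympy2_extended`. An illustrative sanity check of the swapped package (assuming it is installed and exposes the same entry point used in the import above):

```python
from latex2sympy2_extended import latex2sympy  # same entry point, new package

expr = latex2sympy(r'\frac{1}{2} + \frac{1}{3}')
print(expr.evalf())  # expected: roughly 0.8333
```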
evalscope/metrics/t2v_metrics/__init__.py CHANGED
@@ -1,66 +1,52 @@
-from __future__ import absolute_import, division, print_function
-
-from .clipscore import CLIPScore, list_all_clipscore_models
-from .constants import CACHE_DIR
-from .itmscore import ITMScore, list_all_itmscore_models
-from .vqascore import VQAScore, list_all_vqascore_models
-
-
-def list_all_models():
-    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
-
-
-def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
-    if model in list_all_vqascore_models():
-        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_clipscore_models():
-        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_itmscore_models():
-        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    else:
-        raise NotImplementedError()
-
-
 def clip_flant5_score():
+    from .vqascore import VQAScore
     clip_flant5_score = VQAScore(model='clip-flant5-xxl')
     return clip_flant5_score
 
 
 def pick_score():
+    from .clipscore import CLIPScore
     pick_score = CLIPScore(model='pickscore-v1')
     return pick_score
 
 
 def clip_score():
+    from .clipscore import CLIPScore
     clip_score = CLIPScore(model='openai:ViT-L-14-336')
     return clip_score
 
 
 def blip2_score():
+    from .itmscore import ITMScore
     blip_itm_score = ITMScore(model='blip2-itm')
     return blip_itm_score
 
 
 def hpsv2_score():
+    from .clipscore import CLIPScore
     hpsv2_score = CLIPScore(model='hpsv2')
     return hpsv2_score
 
 
 def hpsv2_1_score():
+    from .clipscore import CLIPScore
     hpsv2_1_score = CLIPScore(model='hpsv2.1')
     return hpsv2_1_score
 
 
 def image_reward_score():
+    from .itmscore import ITMScore
     image_reward_score = ITMScore(model='image-reward-v1')
     return image_reward_score
 
 
 def fga_blip2_score():
+    from .itmscore import ITMScore
     fga_blip2_score = ITMScore(model='fga_blip2')
     return fga_blip2_score
 
 
 def mps_score():
+    from .clipscore import CLIPScore
     mps_score = CLIPScore(model='mps')
     return mps_score
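Each score factory now performs its backend import lazily, so importing `evalscope.metrics.t2v_metrics` no longer pulls in every CLIP/ITM/VQA dependency up front. Illustrative usage; note that constructing a scorer will still load (and possibly download) the underlying model:

```python
from evalscope.metrics.t2v_metrics import clip_score  # factory defined in this __init__

scorer = clip_score()  # imports .clipscore and loads 'openai:ViT-L-14-336' on first use
```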
evalscope/models/adapters/__init__.py CHANGED
@@ -1,4 +1,5 @@
 from .base_adapter import BaseModelAdapter, initialize_model_adapter
+from .bfcl_adapter import BFCLAdapter
 from .chat_adapter import ChatGenerationModelAdapter
 from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
 from .custom_adapter import CustomModelAdapter
@@ -13,5 +14,6 @@ __all__ = [
     'MultiChoiceModelAdapter',
     'CustomModelAdapter',
     'ServerModelAdapter',
+    'BFCLAdapter',
     'T2IModelAdapter',
 ]
evalscope/models/adapters/base_adapter.py CHANGED
@@ -44,35 +44,39 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-        from evalscope.models import ServerModelAdapter
-
-        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
-            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
-                           'Setting output type to generation by default.')
-            benchmark.model_adapter = OutputType.GENERATION
-
-        return ServerModelAdapter(
-            api_url=task_cfg.api_url,
-            model_id=task_cfg.model,
-            api_key=task_cfg.api_key,
-            seed=task_cfg.seed,
-            timeout=task_cfg.timeout,
-            stream=task_cfg.stream,
-        )
     else:
         from ..register import get_model_adapter
 
-        # for local model, we need to determine the model adapter class based on the output type
+        # we need to determine the model adapter class based on the output type
         model_adapter_cls_str = benchmark.model_adapter
-        if model_adapter_cls_str not in benchmark.output_types:
-            logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
-                           f'Using {benchmark.output_types[0]} instead.')
-            model_adapter_cls_str = benchmark.output_types[0]
 
-        model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-        return model_adapter_cls(
-            model=base_model,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template,
-            task_cfg=task_cfg)
+        if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
+
+            if 'server' not in model_adapter_cls_str:
+                model_adapter_cls_str = 'server'
+                logger.info(
+                    f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
+
+            # init server model adapter
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+
+            return model_adapter_cls(
+                api_url=task_cfg.api_url,
+                model_id=task_cfg.model,
+                api_key=task_cfg.api_key,
+                seed=task_cfg.seed,
+                timeout=task_cfg.timeout,
+                stream=task_cfg.stream,
+            )
+        else:
+            if model_adapter_cls_str not in benchmark.output_types:
+                logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
+                               f'Using {benchmark.output_types[0]} instead.')
+                model_adapter_cls_str = benchmark.output_types[0]
+
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+            return model_adapter_cls(
+                model=base_model,
+                generation_config=task_cfg.generation_config,
+                chat_template=task_cfg.chat_template,
+                task_cfg=task_cfg)
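The service branch is now resolved through the adapter registry (`get_model_adapter`) instead of always instantiating `ServerModelAdapter` directly, which lets benchmarks supply their own server-style adapters. A hedged sketch of a service-mode task that takes this path; the endpoint, model name, and the `'service'` literal for `eval_type` are assumptions based on evalscope's documented usage:

```python
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',                           # placeholder served model name
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder endpoint
    api_key='EMPTY',
    eval_type='service',   # routes initialize_model_adapter() into the server branch above
    datasets=['gsm8k'],    # placeholder dataset
)
```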