evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
@@ -1,17 +1,19 @@
-import importlib
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List

 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
 from evalscope.api.metric import Score
+from evalscope.api.metric.scorer import AggScore
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -67,18 +69,50 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        spec = importlib.util.find_spec('bfcl_eval')
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True, feature_name=self.pretty_name)

         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False

         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)

+    def _weighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
+        """Calculate weighted average for given subsets.
+
+        Returns:
+            Subset: A new Subset object with weighted average score
+        """
+        total_score = 0
+        total_count = 0
+        for name in subset_names:
+            if name in subset_dict:
+                subset = subset_dict[name]
+                total_score += subset.score * subset.num
+                total_count += subset.num
+
+        weighted_avg = total_score / total_count if total_count > 0 else 0
+        return Subset(name='', score=weighted_avg, num=total_count)
+
+    def _unweighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
+        """Calculate unweighted average for given subsets.
+
+        Returns:
+            Subset: A new Subset object with unweighted average score
+        """
+        scores = []
+        total_count = 0
+        for name in subset_names:
+            if name in subset_dict:
+                subset = subset_dict[name]
+                scores.append(subset.score)
+                total_count += subset.num
+
+        unweighted_avg = sum(scores) / len(scores) if scores else 0
+        return Subset(name='', score=unweighted_avg, num=total_count)
+
     def preprocess_row(self, row: dict):
         """
         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
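The `__init__` change above swaps a hand-rolled `importlib.util.find_spec` check for the new `check_import` helper from `evalscope.utils.import_utils` (extended in this release, +73 lines). The helper's implementation is not shown in this diff; the sketch below is only an illustration of what such a wrapper might look like, assuming it checks `find_spec` and either raises or reports depending on `raise_error`:

```python
import importlib.util
from typing import Optional


def check_import(module: str, package: Optional[str] = None, raise_error: bool = True,
                 feature_name: Optional[str] = None) -> bool:
    """Illustrative stand-in for evalscope.utils.import_utils.check_import (real code not in this diff)."""
    if importlib.util.find_spec(module) is not None:
        return True
    hint = f'`{module}` not found, please install it with `pip install {package or module}`'
    if feature_name:
        hint += f' before evaluating {feature_name}'
    if raise_error:
        raise ImportError(hint)
    print(hint)  # the real helper presumably routes this through the project logger
    return False
```

Called as in the hunk above (`check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True, feature_name=self.pretty_name)`), a missing dependency fails fast with an actionable install hint.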
@@ -256,3 +290,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
         score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """ # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = self._unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = self._unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = self._weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = self._weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = self._weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = self._unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = self._weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = self._unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = self._unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
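To make the difference between the two aggregation helpers concrete, here is a small standalone sketch (not part of the package) that reproduces the same arithmetic on invented subset scores; `Subset` is stubbed with a dataclass since only the `name`, `score`, and `num` fields are used above:

```python
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Subset:  # minimal stand-in for evalscope.report.Subset
    name: str
    score: float
    num: int


def weighted_avg(names: List[str], subsets: Dict[str, Subset]) -> float:
    picked = [subsets[n] for n in names if n in subsets]
    total = sum(s.num for s in picked)
    return sum(s.score * s.num for s in picked) / total if total else 0.0


def unweighted_avg(names: List[str], subsets: Dict[str, Subset]) -> float:
    picked = [subsets[n] for n in names if n in subsets]
    return sum(s.score for s in picked) / len(picked) if picked else 0.0


subsets = {
    'live_simple': Subset('live_simple', 0.90, 200),
    'live_parallel': Subset('live_parallel', 0.50, 20),
}
# Weighted: (0.90*200 + 0.50*20) / 220 ≈ 0.864, so large subsets dominate.
print(weighted_avg(['live_simple', 'live_parallel'], subsets))
# Unweighted: (0.90 + 0.50) / 2 = 0.70, so each subset counts equally.
print(unweighted_avg(['live_simple', 'live_parallel'], subsets))
```

This is why the adapter mixes both: sample-heavy live subsets are combined by sample count, while the top-level BFCL categories are averaged per subset.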
@@ -72,13 +72,14 @@ def generate_turn(model: Model, row: dict[str, Any]):

             # Handle the response based on the model output structure
             message = model_output.message
-            model_usage += model_output.usage
+            if model_output.usage is not None:
+                model_usage += model_output.usage

             current_messages.append(message)
             if isinstance(message, str):
                 result = message
             else:
-                result = message.content
+                result = message.text

             logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
             current_responses.append(result)
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):

             n_steps += 1
             if n_steps > MAXIMUM_STEP_LIMIT:
-                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                 break

         all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             new_tools = row['missing_functions'][str(turn_idx)]
             for new_tool in new_tools:
                 cur_tool = new_tool[0]
-                # change type to object
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                 tools.append({
                     'type': 'function',
                     'function': cur_tool,
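The simplification above now unconditionally forces `parameters.type` to `'object'` before the missing function is appended as a tool. For reference, the entry it builds follows the OpenAI-style function-tool layout already used in this file; a minimal illustration with invented field values:

```python
# Hypothetical missing-function entry injected for a later turn. Only the shape
# matters: 'parameters' must be a JSON Schema object for tool calling to work.
cur_tool = {
    'name': 'get_weather',  # example name, not taken from the BFCL dataset
    'description': 'Look up the current weather for a city.',
    'parameters': {
        'type': 'object',   # the value forced by the change above
        'properties': {'city': {'type': 'string'}},
        'required': ['city'],
    },
}
tools = [{
    'type': 'function',
    'function': cur_tool,
}]
```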
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

             # Handle the response based on the model output structure
             message = model_output.message
-            model_usage += model_output.usage
+            if model_output.usage is not None:
+                model_usage += model_output.usage

             current_messages.append(message)
             if isinstance(message, str):
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
                     logger.error(f'Error converting tool calls to function call strings: {e}')
                     tool_call_strs = None
             else:
-                model_responses = [message.content]
+                model_responses = [message.text]
                 tool_call_strs = None

             current_responses.extend(model_responses)
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

             n_steps += 1
             if n_steps > MAXIMUM_STEP_LIMIT:
-                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                 break

         all_model_responses.append(current_responses)
@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from functools import partial
 from typing import Any, Dict

 from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
-from evalscope.api.dataset import Dataset, RemoteDataLoader, Sample
+from evalscope.api.dataset import Sample
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -6,9 +6,7 @@ from typing import Any, Dict, List
 from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
 from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
 from evalscope.api.evaluator import TaskState
-from evalscope.api.metric import Score
 from evalscope.api.metric.scorer import AggScore, SampleScore
-from evalscope.api.model.model import Model
 from evalscope.api.registry import get_benchmark, register_benchmark
 from evalscope.config import TaskConfig
 from evalscope.constants import DataCollection, Tags
@@ -23,7 +21,11 @@ logger = get_logger()
     BenchmarkMeta(
         name=DataCollection.NAME,
         dataset_id='', # dataset_id need to be set
-        description='Data collection',
+        description='Custom Data collection, mixing multiple evaluation datasets for '
+        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+        'assessment of the model\'s capabilities. '
+        '[Usage Reference](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/collection/index.html)',
+        tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
         prompt_template='',
@@ -55,9 +57,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
             data_id_or_path=dataset_path,
             split=self.eval_split,
             sample_fields=self.record_to_sample,
-            subset=self.default_subset,
+            subset='test', # NOTE: using hardcoded test subset
             limit=self.limit,
-            repeats=self.repeats
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()

         test_dataset = DatasetDict({self.default_subset: dataset})
@@ -95,7 +98,6 @@ class DataCollectionAdapter(DefaultDataAdapter):

         # load dataset args
         dataset_args = copy.deepcopy(self._task_config.dataset_args)
-        common_args = dataset_args.get(DataCollection.NAME, {})

         # Iterate through each sample in the dataset
         dataset = self.test_dataset[self.default_subset]
@@ -108,7 +110,6 @@ class DataCollectionAdapter(DefaultDataAdapter):

             # update dataset args
             cur_dataset_args = dataset_args.get(dataset_name, {})
-            cur_dataset_args.update(common_args)

             # Initialize dataset adapter
             if dataset_name not in self.dataset_adapters:
@@ -141,19 +142,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
         data = []
         for sample_score in sample_scores:
             collection_info = sample_score.sample_metadata[DataCollection.INFO]
-            for metric_name, value in sample_score.score.value.items():
-                data.append(
-                    dict(
-                        task_type=collection_info['task_type'],
-                        categories=tuple(collection_info['categories']),
-                        dataset_name=collection_info['dataset_name'],
-                        subset_name=collection_info['subset_name'],
-                        tags=collection_info['tags'],
-                        sample_id=sample_score.sample_id,
-                        metric=metric_name,
-                        score=value
-                    )
+            main_score = sample_score.score.main_value
+            main_metric = sample_score.score.main_score_name
+
+            # use main score
+            data.append(
+                dict(
+                    task_type=collection_info['task_type'],
+                    categories=tuple(collection_info['categories']),
+                    dataset_name=collection_info['dataset_name'],
+                    subset_name=collection_info['subset_name'],
+                    tags=collection_info['tags'],
+                    sample_id=sample_score.sample_id,
+                    metric=main_metric,
+                    score=main_score
                 )
+            )

         df = pd.DataFrame(data)
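With the change above, each sample now contributes exactly one row, built from its `main_score_name` and `main_value`, instead of one row per metric. A toy sketch of the resulting frame and a typical aggregation (column names follow the dict built above; the data and a subset of columns are invented for illustration):

```python
import pandas as pd

rows = [
    {'task_type': 'qa', 'dataset_name': 'arc', 'subset_name': 'easy', 'metric': 'acc', 'score': 1.0},
    {'task_type': 'qa', 'dataset_name': 'arc', 'subset_name': 'easy', 'metric': 'acc', 'score': 0.0},
    {'task_type': 'math', 'dataset_name': 'gsm8k', 'subset_name': 'main', 'metric': 'acc', 'score': 1.0},
]
df = pd.DataFrame(rows)
# Average the per-sample main scores per dataset/subset, keeping the sample count.
summary = df.groupby(['dataset_name', 'subset_name', 'metric'])['score'].agg(['mean', 'count'])
print(summary)
```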
 
@@ -54,7 +54,7 @@ class DROPAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        if self.few_shot_num != 0:
+        if self.few_shot_num != 0 and self.few_shot_num != 3:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
         else:
@@ -61,7 +61,8 @@ class FramesAdapter(DefaultDataAdapter):
             sample_fields=self.record_to_sample,
             subset='test',
             limit=self.limit,
-            repeats=self.repeats
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()

         test_dataset = DatasetDict({'test': dataset})
@@ -75,7 +75,11 @@ class GeneralArenaAdapter(DefaultDataAdapter):
         dataset_dict = {}
         for subset_name, samples in datasets.items():
             dataset = DictDataLoader(
-                dict_list=samples, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+                dict_list=samples,
+                limit=self.limit,
+                shuffle=self.shuffle,
+                repeats=self.repeats,
+                sample_fields=self.record_to_sample
             ).load()
             dataset_dict[subset_name] = dataset

@@ -0,0 +1,282 @@
+import copy
+import os
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import LocalDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.function_utils import retry_context
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GRADER_TEMPLATE = """
+Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item.
+
+# Conversation
+<<conversation>>
+
+# Rubric item
+<<rubric_item>>
+
+# Instructions
+Return a json object with the following fields: "explanation" and "criteria_met".
+- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item.
+- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true is all of the criteria are met.
+- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. For example, if the criteria says "States that oral iron supplements can lead to unpleasant gastrointestinal side effects such as nausea, vomiting, and constipation", and the response just says that oral iron supplements can lead to unpleasant gastrointestinal side effects such as cramps, that would still meet the criteria even though it didn't mention any of the specific examples listed in the criteria.
+
+# Example 1
+For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this:
+
+```json
+{
+  "explanation": "The assistant did not tell the user to go to the hospital if unconscious.",
+  "criteria_met": false
+}
+```
+
+# Example 2
+Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not.
+For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this:
+
+```json
+{
+  "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.",
+  "criteria_met": false
+}
+```
+
+As another example, if the critera says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this:
+
+```json
+{
+  "explanation": "The response is concise, so the criteria was not met.",
+  "criteria_met": false
+}
+```
+
+In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true.
+
+# Final instruction
+Return just the json object in markdown format. Do not include any other text in the response.
+""".strip() # noqa: E501
+
+# Available subsets in the HealthBench dataset
+# Each subset focuses on different aspects of health-related conversations
+SUBSET_LIST = [
+    'emergency_referrals', # Situations requiring immediate medical attention
+    'communication', # Communication skills and patient interaction
+    'complex_responses', # Complex medical scenarios requiring detailed responses
+    'hedging', # Appropriate uncertainty and hedging in medical advice
+    'health_data_tasks', # Tasks involving health data analysis
+    'global_health', # Global health perspectives and cultural considerations
+    'context_seeking', # Ability to seek additional context when needed
+]
+
+# Available versions of the dataset
+VERSION = [
+    'Consensus',
+    'Hard',
+    'All',
+]
+
+# Mapping of version names to their corresponding data files
+VERSION_FILE = {
+    'All': '2025-05-07-06-14-12_oss_eval.jsonl', # Complete dataset
+    'Consensus': 'consensus_2025-05-09-20-00-46.jsonl', # Consensus subset
+    'Hard': 'hard_2025-05-08-21-00-10.jsonl', # Hard examples subset
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='health_bench',
+        pretty_name='HealthBench',
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'HealthBench: a new benchmark designed to better measure capabilities of AI systems for health. Built in partnership with 262 physicians who have practiced in 60 countries, HealthBench includes 5,000 realistic health conversations, each with a custom physician-created rubric to grade model responses.', # noqa: E501
+        dataset_id='openai-mirror/healthbench',
+        subset_list=SUBSET_LIST,
+        metric_list=[
+            'communication_quality',
+            'instruction_following',
+            'accuracy',
+            'context_awareness',
+            'completeness',
+        ],
+        aggregation='clipped_mean',
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='Answer the question:\n\n{question}',
+        extra_params={
+            'version': f'# File version, choose from {VERSION}, default to {VERSION[0]}',
+        }
+    )
+)
+class HealthBenchAdapter(DefaultDataAdapter):
+    """
+    Adapter for the HealthBench dataset that handles loading health conversation data
+    and evaluating AI responses using physician-created rubrics.
+
+    This adapter supports multiple dataset versions and uses LLM judges to evaluate
+    responses against detailed medical criteria.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the HealthBench adapter.
+
+        Sets up default configuration including:
+        - LLM judge evaluation
+        - Dataset version selection
+        - Subset reformatting
+        """
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True # Use LLM as a judge by default
+        self.reformat_subset = True
+        self.add_aggregation_name = False
+        # Get version from extra parameters, default to first version if not specified
+        self.version = self.extra_params.get('version', VERSION[0])
+        # Validate version parameter
+        if self.version not in VERSION:
+            logger.warning(f'Invalid version {self.version}, choose from {VERSION}, default to {VERSION[0]}')
+            self.version = VERSION[0]
+        # Map version to corresponding data file
+        self.version_file = VERSION_FILE[self.version]
+
+    def load(self):
+        """
+        Load the HealthBench dataset from local or remote source.
+
+        Returns:
+            tuple: (test_dataset, None) where test_dataset is a DatasetDict
+                containing the loaded data split by subsets
+        """
+        # Try to load dataset from local disk
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern=self.version_file)
+
+        # Create local data loader with specified parameters
+        dataset = LocalDataLoader(
+            data_id_or_path=dataset_path,
+            split=self.eval_split,
+            sample_fields=self.record_to_sample,
+            subset=os.path.splitext(self.version_file)[0], # NOTE: using hardcoded test subset
+            shuffle=self.shuffle,
+        ).load()
+
+        # Convert to DatasetDict and apply subset filtering and limiting
+        test_dataset = DatasetDict.from_dataset(
+            dataset=dataset, subset_list=self.subset_list, limit=self.limit, repeats=self.repeats
+        )
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a raw data record to a Sample object.
+
+        Args:
+            record: Raw data record containing prompt, tags, and metadata
+
+        Returns:
+            Sample: Formatted sample with input messages, theme, and metadata
+        """
+        # Convert prompt messages to chat message objects
+        input_messages = [dict_to_chat_message(message) for message in record['prompt']]
+        # Extract theme from example tags, default to 'Unknown' if no tags
+        tags = record['example_tags']
+        theme = tags[0].split(':')[1].strip() if len(tags) > 0 else 'Unknown'
+        return Sample(input=input_messages, target='', subset_key=theme, metadata=record)
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate AI response using LLM judge against physician-created rubrics.
+
+        Args:
+            original_prediction: The AI model's original response
+            filtered_prediction: Filtered/processed version of the response
+            reference: Reference answer (not used in this evaluation)
+            task_state: Contains metadata including rubric items
+
+        Returns:
+            Score: Contains overall score, rubric tag scores, and explanations
+        """
+        from .utils import (
+            RubricItem,
+            calculate_rubric_tag_scores,
+            calculate_score,
+            construct_readable_explanation,
+            parse_json_to_dict,
+        )
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Extract rubric items and conversation from task metadata
+        example = copy.deepcopy(task_state.metadata)
+        rubric_items = [RubricItem.from_dict(d) for d in example['rubrics']]
+        # Construct full conversation including the AI response
+        convo_with_response = example['prompt'] + [dict(content=original_prediction, role='assistant')]
+        # Format conversation as readable string
+        convo_str = '\n\n'.join([f"{m['role']}: {m['content']}" for m in convo_with_response])
+
+        # Evaluate response against each rubric item using LLM judge
+        grading_response_list = []
+        for rubric_item in rubric_items:
+            # Create judge prompt by substituting conversation and rubric item
+            grader_prompt = GRADER_TEMPLATE.replace('<<conversation>>',
+                                                    convo_str).replace('<<rubric_item>>', str(rubric_item))
+            messages = [ChatMessageUser(content=grader_prompt)]
+            # Retry logic for robust evaluation
+            with retry_context(retries=3, sleep_interval=1):
+                grading_response = self.llm_judge.judge(messages=messages)
+                grading_response_dict = parse_json_to_dict(grading_response)
+                # Validate response format and extract boolean criteria_met field
+                if 'criteria_met' in grading_response_dict and isinstance(grading_response_dict['criteria_met'], bool):
+                    grading_response_list.append(grading_response_dict)
+                else:
+                    logger.warning('Grading failed due to bad JSON output, retrying...')
+                    raise ValueError('Grading failed due to bad JSON output')
+
+        # Calculate final scores and explanations
+        overall_score = calculate_score(rubric_items, grading_response_list) # Overall weighted score
+        rubric_tag_scores, axis_grades = calculate_rubric_tag_scores(
+            rubric_items, grading_response_list
+        ) # Scores by category
+        readable_explanation = construct_readable_explanation(
+            rubric_items, grading_response_list
+        ) # Human-readable results
+
+        # Set score values and metadata
+        score.value = {
+            'overall_score': overall_score,
+            **axis_grades, # Include axis scores at top level
+        }
+        score.main_score_name = 'overall_score'
+        score.metadata = {
+            'readable_explanation': readable_explanation,
+            'rubric_tag_scores': rubric_tag_scores,
+        }
+        # Store explanation in sample target for reference
+        task_state.target = '**Score Explanation**\n\n' + readable_explanation
+        return score
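The grading helpers imported from `.utils` (`calculate_score`, `calculate_rubric_tag_scores`, and friends) live in the new `healthbench/utils.py` (+102 lines), which is not reproduced in this diff. As a rough sketch of how a rubric-weighted overall score of this kind is commonly computed (achieved points over the maximum attainable positive points, clipped to [0, 1]), under the assumption that each rubric item carries a signed `points` value and the judge returns `criteria_met`; this is an illustration, not the package's exact code:

```python
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class RubricItem:  # minimal stand-in; the real class is defined in healthbench/utils.py
    criterion: str
    points: float  # negative points mark undesirable criteria


def overall_rubric_score(items: List[RubricItem], gradings: List[Dict]) -> float:
    """Toy version of a clipped, points-weighted rubric score (assumed behavior, not verified)."""
    achieved = sum(item.points for item, g in zip(items, gradings) if g.get('criteria_met'))
    possible = sum(item.points for item in items if item.points > 0)
    if possible <= 0:
        return 0.0
    return min(max(achieved / possible, 0.0), 1.0)


items = [RubricItem('Advises seeing a doctor for chest pain', 5),
         RubricItem('Is overly verbose', -2)]
gradings = [{'criteria_met': True}, {'criteria_met': False}]
print(overall_rubric_score(items, gradings))  # 5 / 5 = 1.0
```

Under this scheme an undesirable criterion only subtracts points when the judge marks it as met, which is why the grader prompt above insists that "good" responses return `criteria_met: false` for negative items.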