evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +62 -2
- evalscope/api/benchmark/meta.py +9 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +1 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
- evalscope/benchmarks/bfcl/generation.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +96 -14
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +30 -10
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +8 -6
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/combinator.py +0 -25
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +56 -7
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
- tests/benchmark/test_eval.py +80 -37
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +137 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +44 -14
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0
evalscope/benchmarks/bfcl/bfcl_adapter.py

```diff
@@ -1,17 +1,19 @@
-import importlib
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
 from evalscope.api.metric import Score
+from evalscope.api.metric.scorer import AggScore
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -67,18 +69,50 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True, feature_name=self.pretty_name)
 
         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
 
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
 
+    def _weighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
+        """Calculate weighted average for given subsets.
+
+        Returns:
+            Subset: A new Subset object with weighted average score
+        """
+        total_score = 0
+        total_count = 0
+        for name in subset_names:
+            if name in subset_dict:
+                subset = subset_dict[name]
+                total_score += subset.score * subset.num
+                total_count += subset.num
+
+        weighted_avg = total_score / total_count if total_count > 0 else 0
+        return Subset(name='', score=weighted_avg, num=total_count)
+
+    def _unweighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
+        """Calculate unweighted average for given subsets.
+
+        Returns:
+            Subset: A new Subset object with unweighted average score
+        """
+        scores = []
+        total_count = 0
+        for name in subset_names:
+            if name in subset_dict:
+                subset = subset_dict[name]
+                scores.append(subset.score)
+                total_count += subset.num
+
+        unweighted_avg = sum(scores) / len(scores) if scores else 0
+        return Subset(name='', score=unweighted_avg, num=total_count)
+
     def preprocess_row(self, row: dict):
         """
         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
@@ -256,3 +290,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
         score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """  # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = self._unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = self._unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = self._weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = self._weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = self._weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = self._unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = self._weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = self._unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = self._unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
```
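The two new aggregation helpers differ only in how subset scores are combined: the weighted variant pools samples, so each subset contributes in proportion to its `num`, while the unweighted variant averages the subset-level means directly. A minimal standalone sketch of the arithmetic, using a hypothetical namedtuple in place of evalscope's own `Subset` class:

```python
from collections import namedtuple

# Hypothetical stand-in for evalscope.report.Subset: (name, mean score, sample count)
Sub = namedtuple('Sub', 'name score num')

subset_dict = {
    'live_simple': Sub('live_simple', 0.90, 200),
    'live_parallel': Sub('live_parallel', 0.50, 50),
}
names = ['live_simple', 'live_parallel']

# Weighted average (as in _weighted_average_from_subsets): pool the samples,
# so the larger subset dominates the result.
total_score = sum(subset_dict[n].score * subset_dict[n].num for n in names)
total_count = sum(subset_dict[n].num for n in names)
print(total_score / total_count)  # (0.9*200 + 0.5*50) / 250 = 0.82

# Unweighted average (as in _unweighted_average_from_subsets): every subset
# counts equally, regardless of how many samples it holds.
scores = [subset_dict[n].score for n in names]
print(sum(scores) / len(scores))  # (0.9 + 0.5) / 2 = 0.70
```

This distinction is what the step list in `_on_generate_report_end` encodes: the non-live and multi-turn rollups treat each subset as one unit, while the live rollups weight by sample count.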
evalscope/benchmarks/bfcl/generation.py

```diff
@@ -72,13 +72,14 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
             result = message
         else:
-            result = message.
+            result = message.text
 
         logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
         current_responses.append(result)
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break
 
     all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             new_tools = row['missing_functions'][str(turn_idx)]
             for new_tool in new_tools:
                 cur_tool = new_tool[0]
-
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                 tools.append({
                     'type': 'function',
                     'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
                 logger.error(f'Error converting tool calls to function call strings: {e}')
                 tool_call_strs = None
         else:
-            model_responses = [message.
+            model_responses = [message.text]
             tool_call_strs = None
 
         current_responses.extend(model_responses)
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break
 
     all_model_responses.append(current_responses)
```
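Both generation loops now accumulate token usage defensively, since a backend may return no `usage` payload on some responses. A minimal sketch of the guard, with a hypothetical `Usage` dataclass standing in for the usage type that the diff shows being summed with `+=`:

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Usage:
    # Hypothetical stand-in for the model-usage object aggregated in generate_turn
    input_tokens: int = 0
    output_tokens: int = 0

    def __add__(self, other: 'Usage') -> 'Usage':
        return Usage(self.input_tokens + other.input_tokens,
                     self.output_tokens + other.output_tokens)

model_usage = Usage()
per_step: List[Optional[Usage]] = [Usage(10, 5), None, Usage(7, 3)]  # None: backend omitted usage
for usage in per_step:
    if usage is not None:  # the guard added in this release
        model_usage += usage

print(model_usage)  # Usage(input_tokens=17, output_tokens=8)
```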
evalscope/benchmarks/ceval/ceval_adapter.py

```diff
@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from functools import partial
 from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
-from evalscope.api.dataset import
+from evalscope.api.dataset import Sample
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
```
evalscope/benchmarks/data_collection/data_collection_adapter.py

```diff
@@ -6,9 +6,7 @@ from typing import Any, Dict, List
 from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
 from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
 from evalscope.api.evaluator import TaskState
-from evalscope.api.metric import Score
 from evalscope.api.metric.scorer import AggScore, SampleScore
-from evalscope.api.model.model import Model
 from evalscope.api.registry import get_benchmark, register_benchmark
 from evalscope.config import TaskConfig
 from evalscope.constants import DataCollection, Tags
@@ -23,7 +21,11 @@ logger = get_logger()
     BenchmarkMeta(
         name=DataCollection.NAME,
         dataset_id='',  # dataset_id need to be set
-        description='Data collection'
+        description='Custom Data collection, mixing multiple evaluation datasets for '
+        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+        'assessment of the model\'s capabilities. '
+        '[Usage Reference](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/collection/index.html)',
+        tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
         prompt_template='',
@@ -55,9 +57,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
             data_id_or_path=dataset_path,
             split=self.eval_split,
             sample_fields=self.record_to_sample,
-            subset=
+            subset='test',  # NOTE: using hardcoded test subset
             limit=self.limit,
-            repeats=self.repeats
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()
 
         test_dataset = DatasetDict({self.default_subset: dataset})
@@ -95,7 +98,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
 
         # load dataset args
         dataset_args = copy.deepcopy(self._task_config.dataset_args)
-        common_args = dataset_args.get(DataCollection.NAME, {})
 
         # Iterate through each sample in the dataset
         dataset = self.test_dataset[self.default_subset]
@@ -108,7 +110,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
 
         # update dataset args
         cur_dataset_args = dataset_args.get(dataset_name, {})
-        cur_dataset_args.update(common_args)
 
         # Initialize dataset adapter
         if dataset_name not in self.dataset_adapters:
@@ -141,19 +142,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
         data = []
         for sample_score in sample_scores:
             collection_info = sample_score.sample_metadata[DataCollection.INFO]
-
-
-
-
-
-
-
-
-
-
-
-
+            main_score = sample_score.score.main_value
+            main_metric = sample_score.score.main_score_name
+
+            # use main score
+            data.append(
+                dict(
+                    task_type=collection_info['task_type'],
+                    categories=tuple(collection_info['categories']),
+                    dataset_name=collection_info['dataset_name'],
+                    subset_name=collection_info['subset_name'],
+                    tags=collection_info['tags'],
+                    sample_id=sample_score.sample_id,
+                    metric=main_metric,
+                    score=main_score
                 )
+            )
 
         df = pd.DataFrame(data)
```
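The rewritten aggregation in `DataCollectionAdapter` flattens each sample's main metric into one dict per sample and hands the list to pandas, which makes the downstream per-task and per-category rollups plain groupby operations. A rough sketch of the resulting frame, with made-up records in place of real `SampleScore` objects:

```python
import pandas as pd

# Made-up records mirroring the dict() built per sample in the new code
data = [
    dict(task_type='math', categories=('reasoning',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=1, metric='acc', score=1.0),
    dict(task_type='math', categories=('reasoning',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=2, metric='acc', score=0.0),
]

df = pd.DataFrame(data)
# e.g. mean of the per-sample main scores for each dataset/subset pair
print(df.groupby(['dataset_name', 'subset_name'])['score'].mean())
```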
evalscope/benchmarks/drop/drop_adapter.py

```diff
@@ -54,7 +54,7 @@ class DROPAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        if self.few_shot_num != 0:
+        if self.few_shot_num != 0 and self.few_shot_num != 3:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
         else:
```
evalscope/benchmarks/general_arena/general_arena_adapter.py

```diff
@@ -75,7 +75,11 @@ class GeneralArenaAdapter(DefaultDataAdapter):
         dataset_dict = {}
         for subset_name, samples in datasets.items():
             dataset = DictDataLoader(
-                dict_list=samples,
+                dict_list=samples,
+                limit=self.limit,
+                shuffle=self.shuffle,
+                repeats=self.repeats,
                 sample_fields=self.record_to_sample
             ).load()
             dataset_dict[subset_name] = dataset
```
evalscope/benchmarks/healthbench/healthbench_adapter.py (new file)

````diff
@@ -0,0 +1,282 @@
+import copy
+import os
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import LocalDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.function_utils import retry_context
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GRADER_TEMPLATE = """
+Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item.
+
+# Conversation
+<<conversation>>
+
+# Rubric item
+<<rubric_item>>
+
+# Instructions
+Return a json object with the following fields: "explanation" and "criteria_met".
+- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item.
+- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true is all of the criteria are met.
+- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. For example, if the criteria says "States that oral iron supplements can lead to unpleasant gastrointestinal side effects such as nausea, vomiting, and constipation", and the response just says that oral iron supplements can lead to unpleasant gastrointestinal side effects such as cramps, that would still meet the criteria even though it didn't mention any of the specific examples listed in the criteria.
+
+# Example 1
+For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this:
+
+```json
+{
+  "explanation": "The assistant did not tell the user to go to the hospital if unconscious.",
+  "criteria_met": false
+}
+```
+
+# Example 2
+Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not.
+For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this:
+
+```json
+{
+  "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.",
+  "criteria_met": false
+}
+```
+
+As another example, if the critera says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this:
+
+```json
+{
+  "explanation": "The response is concise, so the criteria was not met.",
+  "criteria_met": false
+}
+```
+
+In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true.
+
+# Final instruction
+Return just the json object in markdown format. Do not include any other text in the response.
+""".strip()  # noqa: E501
+
+# Available subsets in the HealthBench dataset
+# Each subset focuses on different aspects of health-related conversations
+SUBSET_LIST = [
+    'emergency_referrals',  # Situations requiring immediate medical attention
+    'communication',  # Communication skills and patient interaction
+    'complex_responses',  # Complex medical scenarios requiring detailed responses
+    'hedging',  # Appropriate uncertainty and hedging in medical advice
+    'health_data_tasks',  # Tasks involving health data analysis
+    'global_health',  # Global health perspectives and cultural considerations
+    'context_seeking',  # Ability to seek additional context when needed
+]
+
+# Available versions of the dataset
+VERSION = [
+    'Consensus',
+    'Hard',
+    'All',
+]
+
+# Mapping of version names to their corresponding data files
+VERSION_FILE = {
+    'All': '2025-05-07-06-14-12_oss_eval.jsonl',  # Complete dataset
+    'Consensus': 'consensus_2025-05-09-20-00-46.jsonl',  # Consensus subset
+    'Hard': 'hard_2025-05-08-21-00-10.jsonl',  # Hard examples subset
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='health_bench',
+        pretty_name='HealthBench',
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'HealthBench: a new benchmark designed to better measure capabilities of AI systems for health. Built in partnership with 262 physicians who have practiced in 60 countries, HealthBench includes 5,000 realistic health conversations, each with a custom physician-created rubric to grade model responses.',  # noqa: E501
+        dataset_id='openai-mirror/healthbench',
+        subset_list=SUBSET_LIST,
+        metric_list=[
+            'communication_quality',
+            'instruction_following',
+            'accuracy',
+            'context_awareness',
+            'completeness',
+        ],
+        aggregation='clipped_mean',
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='Answer the question:\n\n{question}',
+        extra_params={
+            'version': f'# File version, choose from {VERSION}, default to {VERSION[0]}',
+        }
+    )
+)
+class HealthBenchAdapter(DefaultDataAdapter):
+    """
+    Adapter for the HealthBench dataset that handles loading health conversation data
+    and evaluating AI responses using physician-created rubrics.
+
+    This adapter supports multiple dataset versions and uses LLM judges to evaluate
+    responses against detailed medical criteria.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the HealthBench adapter.
+
+        Sets up default configuration including:
+        - LLM judge evaluation
+        - Dataset version selection
+        - Subset reformatting
+        """
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.reformat_subset = True
+        self.add_aggregation_name = False
+        # Get version from extra parameters, default to first version if not specified
+        self.version = self.extra_params.get('version', VERSION[0])
+        # Validate version parameter
+        if self.version not in VERSION:
+            logger.warning(f'Invalid version {self.version}, choose from {VERSION}, default to {VERSION[0]}')
+            self.version = VERSION[0]
+        # Map version to corresponding data file
+        self.version_file = VERSION_FILE[self.version]
+
+    def load(self):
+        """
+        Load the HealthBench dataset from local or remote source.
+
+        Returns:
+            tuple: (test_dataset, None) where test_dataset is a DatasetDict
+                containing the loaded data split by subsets
+        """
+        # Try to load dataset from local disk
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern=self.version_file)
+
+        # Create local data loader with specified parameters
+        dataset = LocalDataLoader(
+            data_id_or_path=dataset_path,
+            split=self.eval_split,
+            sample_fields=self.record_to_sample,
+            subset=os.path.splitext(self.version_file)[0],  # NOTE: using hardcoded test subset
+            shuffle=self.shuffle,
+        ).load()
+
+        # Convert to DatasetDict and apply subset filtering and limiting
+        test_dataset = DatasetDict.from_dataset(
+            dataset=dataset, subset_list=self.subset_list, limit=self.limit, repeats=self.repeats
+        )
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a raw data record to a Sample object.
+
+        Args:
+            record: Raw data record containing prompt, tags, and metadata
+
+        Returns:
+            Sample: Formatted sample with input messages, theme, and metadata
+        """
+        # Convert prompt messages to chat message objects
+        input_messages = [dict_to_chat_message(message) for message in record['prompt']]
+        # Extract theme from example tags, default to 'Unknown' if no tags
+        tags = record['example_tags']
+        theme = tags[0].split(':')[1].strip() if len(tags) > 0 else 'Unknown'
+        return Sample(input=input_messages, target='', subset_key=theme, metadata=record)
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate AI response using LLM judge against physician-created rubrics.
+
+        Args:
+            original_prediction: The AI model's original response
+            filtered_prediction: Filtered/processed version of the response
+            reference: Reference answer (not used in this evaluation)
+            task_state: Contains metadata including rubric items
+
+        Returns:
+            Score: Contains overall score, rubric tag scores, and explanations
+        """
+        from .utils import (
+            RubricItem,
+            calculate_rubric_tag_scores,
+            calculate_score,
+            construct_readable_explanation,
+            parse_json_to_dict,
+        )
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Extract rubric items and conversation from task metadata
+        example = copy.deepcopy(task_state.metadata)
+        rubric_items = [RubricItem.from_dict(d) for d in example['rubrics']]
+        # Construct full conversation including the AI response
+        convo_with_response = example['prompt'] + [dict(content=original_prediction, role='assistant')]
+        # Format conversation as readable string
+        convo_str = '\n\n'.join([f"{m['role']}: {m['content']}" for m in convo_with_response])
+
+        # Evaluate response against each rubric item using LLM judge
+        grading_response_list = []
+        for rubric_item in rubric_items:
+            # Create judge prompt by substituting conversation and rubric item
+            grader_prompt = GRADER_TEMPLATE.replace('<<conversation>>',
+                                                    convo_str).replace('<<rubric_item>>', str(rubric_item))
+            messages = [ChatMessageUser(content=grader_prompt)]
+            # Retry logic for robust evaluation
+            with retry_context(retries=3, sleep_interval=1):
+                grading_response = self.llm_judge.judge(messages=messages)
+                grading_response_dict = parse_json_to_dict(grading_response)
+                # Validate response format and extract boolean criteria_met field
+                if 'criteria_met' in grading_response_dict and isinstance(grading_response_dict['criteria_met'], bool):
+                    grading_response_list.append(grading_response_dict)
+                else:
+                    logger.warning('Grading failed due to bad JSON output, retrying...')
+                    raise ValueError('Grading failed due to bad JSON output')
+
+        # Calculate final scores and explanations
+        overall_score = calculate_score(rubric_items, grading_response_list)  # Overall weighted score
+        rubric_tag_scores, axis_grades = calculate_rubric_tag_scores(
+            rubric_items, grading_response_list
+        )  # Scores by category
+        readable_explanation = construct_readable_explanation(
+            rubric_items, grading_response_list
+        )  # Human-readable results
+
+        # Set score values and metadata
+        score.value = {
+            'overall_score': overall_score,
+            **axis_grades,  # Include axis scores at top level
+        }
+        score.main_score_name = 'overall_score'
+        score.metadata = {
+            'readable_explanation': readable_explanation,
+            'rubric_tag_scores': rubric_tag_scores,
+        }
+        # Store explanation in sample target for reference
+        task_state.target = '**Score Explanation**\n\n' + readable_explanation
+        return score
````