evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the content of publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/docmath/docmath_adapter.py

@@ -1,6 +1,14 @@
-from
-
-from evalscope.
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 TEMPLATE_0SHOT = """Please read the following text and answer the question below.
 
@@ -13,73 +21,123 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below
 Format your response as follows: "Therefore, the answer is (insert answer here)"."""
 
 
-@
-
-
-
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='docmath',
+        pretty_name='DocMath',
+        tags=[Tags.REASONING, Tags.MATH, Tags.LONG_CONTEXT],
+        description=
+        'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+        dataset_id='yale-nlp/DocMath-Eval',
+        metric_list=['acc'],
+        subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+        eval_split='test',
+        prompt_template=TEMPLATE_0SHOT,
+    )
 )
-class DocMathAdapter(
+class DocMathAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self._use_llm_judge = True  # Enable LLM judge for DocMath
+        self.split_as_subset = True  # Use split as subset for DocMath
 
-    def
-        # default load mini test
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-
-        question = input_d['question']
-        prompt = self.prompt_template.format(context=context, question=question)
-        return self.gen_prompt_data(prompt)
+        Convert a data record to a Sample object.
 
-
-
-        Parse the raw input labels (gold).
-        """
-        return input_d['ground_truth']
+        Args:
+            record (Dict[str, Any]): Input data record.
 
-
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        ground_truth = record['ground_truth']
+
+        return Sample(
+            input=record['question'],
+            target=str(ground_truth),
+            metadata={
+                'question_id': record.get('question_id', ''),
+                'paragraphs': record['paragraphs'],
+                'answer_type': type(ground_truth).__name__
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        context = '\n'.join(sample.metadata['paragraphs'])
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
-
+        Extract the answer from the model prediction.
         """
         from .utils import extract_answer
 
-        extracted_answer = extract_answer(
+        extracted_answer = extract_answer(prediction)
         return extracted_answer
 
-    def
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
-
+        Calculate accuracy score by matching prediction with reference.
         """
         from .utils import get_acc
 
-
-
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        answer_type = task_state.metadata.get('answer_type', 'unknown')
+        accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
+        score.value = {'acc': accuracy}
+        score.main_score_name = 'acc'
+
+        return score
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Use LLM judge to evaluate the prediction against the reference.
+        """
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
 
-
-
-
-
-
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.metadata.get('question', '')
+
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # Parse grading response
         if 'YES' in orm_response:
-
+            accuracy = 1.0
         else:
-
+            accuracy = 0.0
+
+        score.value = {'acc': accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'acc'
+
+        return score
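
The docmath hunks above illustrate the adapter pattern introduced in 1.0.0: a benchmark registers itself by decorating a DefaultDataAdapter subclass with register_benchmark(BenchmarkMeta(...)), converts raw records to Sample objects in record_to_sample, builds prompts in format_prompt_template, and returns Score objects from match_score. The sketch below applies that same pattern to a made-up benchmark; the name 'my_qa', the dataset id 'my-org/my-qa', and the record fields 'question'/'answer' are hypothetical, while the imports and method signatures are the ones used by the adapters in this diff.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',  # hypothetical benchmark name
        pretty_name='MyQA',
        tags=[Tags.REASONING],
        description='Toy QA benchmark used only to illustrate the 1.0.0 adapter API.',
        dataset_id='my-org/my-qa',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='Question: {question}\nAnswer with a single word.',
    )
)
class MyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the framework's Sample type.
        return Sample(input=record['question'], target=str(record['answer']))

    def format_prompt_template(self, sample: Sample) -> str:
        # Fill the prompt template declared in BenchmarkMeta.
        return self.prompt_template.format(question=sample.input)

    def match_score(
        self,
        original_prediction: str,
        filtered_prediction: str,
        reference: str,
        task_state: TaskState,
    ) -> Score:
        # Simple exact-match accuracy, reported under the 'acc' metric.
        score = Score(extracted_prediction=filtered_prediction, prediction=original_prediction)
        score.value = {'acc': float(filtered_prediction.strip().lower() == reference.strip().lower())}
        score.main_score_name = 'acc'
        return score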
evalscope/benchmarks/docmath/utils.py

@@ -193,23 +193,22 @@ def compare_two_numbers(p, gt):
         return within_eps(pred=p, gt=gt)
 
 
-def get_acc(prediction, gt, cot=True):
+def get_acc(prediction, gt, answer_type, cot=True):
     try:
         if cot:
             prediction = normalize(prediction)
         else:
             prediction = float(prediction)
 
-        answer_type = type(gt).__name__
         assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
         if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
             # Comparing prediction against the reference
             if answer_type in ['bool']:
-                acc = int(prediction == gt)
+                acc = int(prediction == bool(gt))
             elif answer_type == 'int':
-                acc = int(compare_two_numbers(prediction, gt))
+                acc = int(compare_two_numbers(prediction, int(gt)))
             elif answer_type == 'float' or answer_type == 'float64':
-                acc = int(compare_two_numbers(prediction, gt))
+                acc = int(compare_two_numbers(prediction, float(gt)))
             else:
                 acc = 0
         else:
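
The extra answer_type parameter exists because, under the new API, the reference reaches match_score as a string: DocMathAdapter stores target=str(ground_truth) and keeps the original type in Sample.metadata['answer_type'], so type(gt).__name__ would now always be 'str'. A hypothetical call mirroring DocMathAdapter.match_score, with made-up values and assuming the module layout shown in the file list above:

# Illustrative values only; get_acc is the helper updated in the hunk above.
from evalscope.benchmarks.docmath.utils import get_acc

filtered_prediction = '42.5'   # answer already extracted from the model output
reference = '42.5'             # Sample.target, stored as a string
answer_type = 'float'          # recovered from Sample.metadata['answer_type']

accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
print(accuracy)  # expected to be 1 when the prediction matches within tolerance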
evalscope/benchmarks/drop/drop_adapter.py

@@ -1,8 +1,13 @@
+import ast
 import re
-from typing import List
-
-from evalscope.
-from evalscope.
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -28,54 +33,82 @@ Answer: 43
 '''  # noqa: E501
 
 
-@
-
-
-
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='drop',
+        pretty_name='DROP',
+        tags=[Tags.REASONING],
+        description=
+        'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
+        dataset_id='AI-ModelScope/DROP',
+        metric_list=['acc'],
+        few_shot_num=3,
+        train_split=None,
+        eval_split='validation',
+        prompt_template=
+        'You will be asked to read a passage and answer a question. {drop_examples}\n# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+    )
 )
-class DROPAdapter(
+class DROPAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if few_shot_num != 0:
+        if self.few_shot_num != 0:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
         else:
             self.few_shot_num = 0
 
-    def
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
         """
-
-
-
+
+        # Parse gold answers
+        gold_answers = self._get_gold_answers(record)
+
+        return Sample(
+            input=record['question'],
+            target=str(gold_answers),
+            metadata={
+                'passage': record['passage'],
+                'answer': record['answer'],
+                'validated_answers': record['validated_answers']
+            }
+        )
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        drop_examples = ''
+        query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+        return self.prompt_template.format(
            drop_examples=drop_examples,
            query=query,
        )
-        return self.gen_prompt_data(prompt)
 
-    def
+    def format_fewshot_template(self, fewshot, sample):
+        drop_examples = DROP_EXAMPLES
+        query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+        return self.prompt_template.format(
+            drop_examples=drop_examples,
+            query=query,
+        )
+
+    def _get_gold_answers(self, input_d: dict) -> List[str]:
         """
         Parse the raw input labels (gold).
         """
 
         def _flatten_validated_answers(validated_answers):
-            """Flattens a dict of lists of validated answers.
-            {"number": ['1', '8'], ...}
-            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
-            """
+            """Flattens a dict of lists of validated answers."""
             valid_answers = []
             for i in range(len(validated_answers['number'])):
                 valid_answers.append({
@@ -96,24 +129,36 @@ class DROPAdapter(DataAdapter):
             answers.append(answer)
         return answers
 
-    def
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
-
+        Extract the answer from the model prediction.
         """
-        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)',
-        extracted_answer = match.group(1) if match else
+        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', prediction)
+        extracted_answer = match.group(1) if match else prediction
         return extracted_answer
 
-    def
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
-
+        Calculate accuracy score by matching prediction with reference answers.
         """
         from .utils import _answer_to_bags
 
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
         max_em = 0
-
+        reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
+        for gold_answer in reference:
             # Convert the answers to bags of answers
-            predicted_bags = _answer_to_bags(
+            predicted_bags = _answer_to_bags(filtered_prediction)
             gold_bags = _answer_to_bags(gold_answer)
 
             if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
@@ -124,7 +169,10 @@ class DROPAdapter(DataAdapter):
                 if gold_answer[0].strip():
                     max_em = max(max_em, exact_match)
 
-
+        score.value = {'acc': max_em}
+        score.main_score_name = 'acc'
+
+        return score
 
     @staticmethod
     def parse_answer(answer):
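
One detail worth noting in the DROP hunks: the gold answers are a Python list, but Sample.target is stored as its str() form, so match_score first recovers the list with ast.literal_eval before the bag-of-answers comparison. A standalone sketch of that round-trip (values are made up):

import ast

# Serialize the gold answers into the string target, as record_to_sample does.
gold_answers = ['12', 'twelve']
target = str(gold_answers)  # "['12', 'twelve']"

# Recover the list in match_score before comparing answer bags.
reference = ast.literal_eval(target) if isinstance(target, str) else target
assert reference == gold_answers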
evalscope/benchmarks/frames/frames_adapter.py

@@ -1,6 +1,15 @@
-
-from
-
+import os
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 TEMPLATE_0SHOT = """Please read the following text and answer the question below.
 
@@ -13,52 +22,82 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below
 Format your response as follows: "Therefore, the answer is (insert answer here)"."""
 
 
-@
-
-
-
-
-
-
-
-
-
-
-
-        eval_split='test',
-        prompt_template=TEMPLATE_0SHOT,
+@register_benchmark(
+    BenchmarkMeta(
+        name='frames',
+        pretty_name='FRAMES',
+        tags=[Tags.REASONING, Tags.LONG_CONTEXT],
+        description=
+        'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+        dataset_id='iic/frames',
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=TEMPLATE_0SHOT,
+    )
 )
-class FramesAdapter(
+class FramesAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self._use_llm_judge = True  # Enable LLM judge for FRAMES
+
+    def load(self):
+        # Try to load dataset from local disk
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
 
-
-
-
-
-        return data_dict
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='test.jsonl')
 
-
-
-
-
-
-
-
-
+        dataset = LocalDataLoader(
+            data_id_or_path=dataset_path,
+            split=self.eval_split,
+            sample_fields=self.record_to_sample,
+            subset='test',
+            limit=self.limit,
+            repeats=self.repeats
+        ).load()
 
-
-
-
+        test_dataset = DatasetDict({'test': dataset})
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-
+        Convert a data record to a Sample object.
 
-
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in record['wiki_items']])
+        question = record['Prompt']
+
+        return Sample(
+            input=question, target=record['Answer'], metadata={
+                'context': context,
+                'wiki_items': record['wiki_items']
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
-
+        Extract the answer from the model prediction.
         """
-        response =
+        response = prediction.replace('*', '')
 
         if 'the answer is' in response:
             ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
@@ -67,25 +106,69 @@ class FramesAdapter(DataAdapter):
 
         return ans
 
-    def
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
-
+        Calculate accuracy score by matching prediction with reference.
         """
+        from evalscope.metrics import exact_match
         from .utils import normalize_answer
-        gold = normalize_answer(gold)
-        pred = normalize_answer(pred)
-        return exact_match(gold=gold, pred=pred)
 
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        gold = normalize_answer(reference)
+        pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=gold, pred=pred)
+
+        score.value = {'acc': accuracy}
+        score.main_score_name = 'acc'
+
+        return score
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Use LLM judge to evaluate the prediction against the reference.
+        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
 
-
-
-
-
-
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # Parse grading response
         if 'YES' in orm_response:
-
+            accuracy = 1.0
         else:
-
+            accuracy = 0.0
+
+        score.value = {'acc': accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'acc'
+
+        return score
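
Both DocMathAdapter.llm_match_score and FramesAdapter.llm_match_score reduce the judge's reply to a binary accuracy by checking for the substring 'YES'. A standalone sketch of that parsing step (the judge call itself, self.llm_judge.judge(...), is stubbed out here):

def parse_orm_response(orm_response: str) -> float:
    # Grading responses are treated as correct only if they contain 'YES'.
    return 1.0 if 'YES' in orm_response else 0.0

assert parse_orm_response('YES, the two answers are equivalent.') == 1.0
assert parse_orm_response('NO, the final values differ.') == 0.0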