evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
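The adapter diffs reproduced below for frames_adapter.py and general_arena_adapter.py illustrate the central API change in this release: benchmark adapters move off the 0.17 DataAdapter base class onto the new evalscope.api package, registering themselves with register_benchmark and a BenchmarkMeta, and implementing hooks such as record_to_sample, extract_answer, and match_score. A minimal sketch of that pattern, assuming only the names visible in the diffs below (the benchmark name, dataset id, and record fields here are hypothetical):

# Minimal sketch of the 1.0 adapter pattern (illustrative names only).
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_benchmark',             # hypothetical benchmark name
        pretty_name='MyBenchmark',
        tags=[Tags.REASONING],
        description='Illustrative custom benchmark.',
        dataset_id='my_org/my_dataset',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
    )
)
class MyBenchmarkAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw record onto the Sample structure the evaluator consumes.
        return Sample(input=record['question'], target=record['answer'])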
evalscope/benchmarks/frames/frames_adapter.py

@@ -1,6 +1,15 @@
+import os
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 TEMPLATE_0SHOT = """Please read the following text and answer the question below.
 
@@ -13,52 +22,83 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below
 Format your response as follows: "Therefore, the answer is (insert answer here)"."""
 
 
-    eval_split='test',
-    prompt_template=TEMPLATE_0SHOT,
+@register_benchmark(
+    BenchmarkMeta(
+        name='frames',
+        pretty_name='FRAMES',
+        tags=[Tags.REASONING, Tags.LONG_CONTEXT],
+        description=
+        'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+        dataset_id='iic/frames',
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=TEMPLATE_0SHOT,
+    )
 )
-class FramesAdapter(
+class FramesAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self._use_llm_judge = True  # Enable LLM judge for FRAMES
+
+    def load(self):
+        # Try to load dataset from local disk
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
 
-        return data_dict
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='test.jsonl')
 
+        dataset = LocalDataLoader(
+            data_id_or_path=dataset_path,
+            split=self.eval_split,
+            sample_fields=self.record_to_sample,
+            subset='test',
+            limit=self.limit,
+            repeats=self.repeats,
+            shuffle=self.shuffle,
+        ).load()
 
+        test_dataset = DatasetDict({'test': dataset})
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
+        Convert a data record to a Sample object.
 
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in record['wiki_items']])
+        question = record['Prompt']
+
+        return Sample(
+            input=question, target=record['Answer'], metadata={
+                'context': context,
+                'wiki_items': record['wiki_items']
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
+        Extract the answer from the model prediction.
         """
-        response =
+        response = prediction.replace('*', '')
 
         if 'the answer is' in response:
             ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
@@ -67,25 +107,69 @@ class FramesAdapter(DataAdapter):
 
         return ans
 
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
+        Calculate accuracy score by matching prediction with reference.
         """
+        from evalscope.metrics import exact_match
         from .utils import normalize_answer
-        gold = normalize_answer(gold)
-        pred = normalize_answer(pred)
-        return exact_match(gold=gold, pred=pred)
 
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        gold = normalize_answer(reference)
+        pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=gold, pred=pred)
+
+        score.value = {'acc': accuracy}
+        score.main_score_name = 'acc'
+
+        return score
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Use LLM judge to evaluate the prediction against the reference.
+        """
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
 
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # Parse grading response
         if 'YES' in orm_response:
+            accuracy = 1.0
         else:
+            accuracy = 0.0
+
+        score.value = {'acc': accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'acc'
+
+        return score
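For reference, a smoke-test invocation of the migrated frames benchmark might look as follows. TaskConfig and run_task are part of evalscope (see evalscope/config.py and evalscope/run.py in the file list above); the model name and sample limit below are placeholders, and judge-related configuration is omitted, so treat this as a sketch rather than a confirmed recipe.

# Hedged sketch: run the 'frames' benchmark registered in the diff above.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-plus',    # placeholder: any model evalscope can serve or load
    datasets=['frames'],  # benchmark name from the BenchmarkMeta above
    limit=5,              # small sample count for a quick check
)
run_task(task_cfg=task_cfg)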
evalscope/benchmarks/general_arena/general_arena_adapter.py

@@ -1,16 +1,19 @@
+# flake8: noqa: E501
 import glob
 import os
 from collections import defaultdict
-from typing import Any, List
-
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.report import Report, ReportKey
 from evalscope.utils.logger import get_logger
 
-# flake8: noqa
-
 logger = get_logger()
 
 GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."  # noqa: E501
@@ -19,59 +22,81 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 )  # noqa: E501
 
 
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_arena',
+        pretty_name='GeneralArena',
+        tags=[Tags.CUSTOM, Tags.ARENA],
+        description=
+        'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
+        'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
+        'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
+        'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/arena.html).',
+        dataset_id='general_arena',
+        metric_list=['winrate'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        system_prompt=GRADER_SYSTEM_PROMPT,
+        prompt_template=GRADER_TEMPLATE,
+        extra_params={
+            'models': [{
+                'name': 'qwen-plus',
+                'report_path': 'outputs/20250627_172550/reports/qwen-plus'
+            }, {
+                'name': 'qwen2.5-7b',
+                'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
+            }],
+            'baseline':
+            'qwen2.5-7b'
+        }
+    )
+)
+class GeneralArenaAdapter(DefaultDataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        metric_registry.register(Metric(name='winrate', object=mean))
+        self._use_llm_judge = True
 
+        self.models = self.extra_params.get('models', [])
+        self.baseline = self.extra_params.get('baseline', None)
 
-        self.baseline = extra_params.get('baseline', None)
-
-    def load(self, **kwargs):
+    def load(self):
+        """Load dataset by processing model reports."""
         self._check_names()
         self._check_reports()
         self._check_datasets()
         logger.info(f'Overall datasets: {self.overall_datasets}')
         dataset_model_dict = self._load_common_datasets()
+        datasets = self._build_pair_wise_data(dataset_model_dict)
+
+        # Convert to DatasetDict format
+        dataset_dict = {}
+        for subset_name, samples in datasets.items():
+            dataset = DictDataLoader(
+                dict_list=samples,
+                limit=self.limit,
+                shuffle=self.shuffle,
+                repeats=self.repeats,
+                sample_fields=self.record_to_sample
+            ).load()
+            dataset_dict[subset_name] = dataset
+
+        test_dataset = DatasetDict(dataset_dict)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['question'])],
+            target=record['answer_2'],  # baseline answer
+            metadata={
+                'answer_1': record['answer_1'],
+                'model_1': record['model_1'],
+                'model_2': record['model_2'],
+            }
+        )
 
     def _check_names(self):
         """Check the names of the models and baseline."""
@@ -119,7 +144,8 @@ class GeneralArenaAdapter(DataAdapter):
 
     def _load_common_datasets(self):
         """Load common datasets from the local path."""
-        from evalscope.utils import OutputsStructure
+        from evalscope.utils import OutputsStructure
+        from evalscope.utils.io_utils import jsonl_to_list
 
         dataset_dict = defaultdict(dict)
         for dataset_name, subset_name in self.overall_datasets:
@@ -128,7 +154,8 @@
                 dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
                 if not os.path.exists(dataset_file_path):
                     raise ValueError(
-                        f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
+                        f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
+                    )
                 dataset = jsonl_to_list(dataset_file_path)
                 # sort by index
                 dataset.sort(key=lambda x: x.get('index'))
@@ -138,9 +165,10 @@
 
     def _build_pair_wise_data(self, dataset_dict):
         """Build pairwise data for the models."""
+        from evalscope.api.evaluator import ReviewResult
         from .utils import process_review_item
 
-        pairwise_data = defaultdict(
+        pairwise_data = defaultdict(list)
         for (dataset_name, subset_name), model_data in dataset_dict.items():
             if len(model_data) < 2:
                 logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')
@@ -152,8 +180,13 @@
                     continue
                 pairs = []
                 for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
+                    # Convert to ReviewResult objects like in get_model_prediction
+                    model_review = ReviewResult.model_validate(model_item)
+                    baseline_review = ReviewResult.model_validate(baseline_item)
+
                     for model_choice, baseline_choice in zip(
+                        process_review_item(model_review), process_review_item(baseline_review)
+                    ):
                         pairs.append({
                             'question': model_choice['Question'],
                             'answer_1': model_choice['Generated'],
@@ -161,23 +194,26 @@
                             'model_1': name,
                             'model_2': self.baseline
                         })
-                pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}']
+                pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'] = pairs
 
         return pairwise_data
 
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the baseline."""
         from .utils import get_judge_score, post_process_result
 
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+        answer_1 = task_state.metadata['answer_1']
+        answer_2 = reference  # baseline answer
+        model_1 = task_state.metadata['model_1']
+        model_2 = task_state.metadata['model_2']
 
         system_template = self.system_prompt
         prompt_template = self.prompt_template
@@ -185,9 +221,11 @@
         prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
         # reverse the order
         prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
+
         # get grading response
-        game1_response = judge(prompt1, system_prompt=system_template)
-        game2_response = judge(prompt2, system_prompt=system_template)
+        game1_response = self.llm_judge.judge(prompt1, system_prompt=system_template)
+        game2_response = self.llm_judge.judge(prompt2, system_prompt=system_template)
+
         # parse grading response
         # game1
         res1 = post_process_result(game1_response)
@@ -195,9 +233,9 @@
         # game2
         res2 = post_process_result(game2_response)
         score2 = get_judge_score(res2, reverse=True)
+
+        battle_result = {
+            'score': (score1 + score2) / 2,
             'games': [
                 {
                     'model_a': model_1,
@@ -214,31 +252,38 @@
             ]
         }
 
+        score.value = {'score': battle_result['score']}
+        score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown',
+            'battle_result': battle_result
+        }
+        score.main_score_name = 'score'
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores to compute winrate."""
         import numpy as np
         import pandas as pd
 
         from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
 
-        review_res_list = [item for sublist in review_res_list for item in sublist]
-        battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+        battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
 
         bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)
 
         bootstrap_model_coef = get_bootstrap_result(
-            battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
+            battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
+        )
 
         stats = pd.DataFrame()
         stats['results'] = None
         stats['results'] = stats['results'].astype('object')
 
         for i, model in enumerate(bt_model_coef.index):
-            # assert model in bootstrap_elo_lu.columns
             stats.at[i, 'model'] = model
             stats.at[i, 'score'] = bt_model_coef[model]
             stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)
@@ -249,20 +294,25 @@
         metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
         metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()
 
+        agg_scores = []
         for metric_name, models in metrics_dict.items():
-            for model_name,
+            for model_name, score_val in models.items():
                 if model_name == self.baseline:
                     continue
+                agg_scores.append(AggScore(score=score_val, metric_name=metric_name, num=len(sample_scores)))
+
+        return agg_scores
 
+    def extract_answer(self, prediction, task_state):
+        # NOTE: This is a hacky way to extract the answer from the prediction
+        return task_state.metadata['answer_1']
+
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         """Post-process the report to convert it to a DataFrame with winrate leaderboards."""
         import pandas as pd
         import tabulate
 
-        report_path =
+        report_path = output_dir
         leaderboard_file = os.path.join(report_path, 'leaderboard.txt')
 
         # Ensure report directory exists
@@ -288,7 +338,8 @@
         """Format DataFrame as leaderboard with CI."""
         # Pivot to get winrate, winrate_lower, winrate_upper as columns
         pivot_df = data_df.pivot_table(
-            index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+            index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+        )
 
         # Add baseline model with 50% winrate
         baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}
@@ -392,20 +443,11 @@
             subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
                                   & (parsed_df['subset_name'] == subset_name)]
             leaderboard_outputs.append(
-                format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+                format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+            )
 
         # Write all leaderboard outputs to file
         with open(leaderboard_file, 'w', encoding='utf-8') as f:
             f.write('\n'.join(leaderboard_outputs))
 
         logger.info(f'Leaderboard results saved to: {leaderboard_file}')
-
-    def get_gold_answer(self, input_d):
-        return f"model_1: {input_d['model_1']}\n---\n" + input_d['answer_1']
-
-    def llm_parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
-        return f"model_2: {raw_input_d['model_2']}\n---\n" + raw_input_d['answer_2']
-
-    def match(self, gold, pred):
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        return