evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
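
The listing above reflects a major restructuring between 0.17.1 and 1.0.0: the old `evalscope/benchmarks/data_adapter.py`, the `evalscope/models/adapters/*` modules and `evalscope/collections/evaluator.py` are removed, and a new `evalscope.api` package (benchmark, dataset, evaluator, messages, metric, model, tool) takes their place. As a quick orientation, the sketch below shows how two of the migrated benchmarks might be run after upgrading. It assumes the documented `TaskConfig`/`run_task` entry points keep their names in 1.0.0 (`evalscope/config.py` and `evalscope/run.py` are modified, not removed, in this diff); the model id, dataset selection and `limit` value are placeholders.

```python
# Minimal sketch, assuming `TaskConfig` and `run_task` remain importable from
# the top-level `evalscope` package in 1.0.0. Model id and limit are placeholders.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',   # placeholder model identifier
    datasets=['mmlu_redux', 'musr'],      # two benchmarks migrated in this diff
    limit=5,                              # small smoke-test subset per dataset
)

run_task(task_cfg=task)
```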
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py

@@ -1,29 +1,14 @@
-from collections import defaultdict
 from typing import Any, Dict

-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 logger = get_logger()

-SUBSET_LIST = [
-    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
-    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
-    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
-    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
-    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
-    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
-    'world_religions'
-]
-
 SUBJECT_MAPPING = {
     'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],

@@ -84,25 +69,31 @@ SUBJECT_MAPPING = {
     'world_religions': ['World Religions', 'philosophy', 'Humanities'],
 }

-… (17 lines removed; content not shown in the rendered diff)
+SUBSET_LIST = list(SUBJECT_MAPPING.keys())
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_redux',
+        pretty_name='MMLU-Redux',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options. '  # noqa: E501
+        'The bad answers are corrected.',  # noqa: E501
+        dataset_id='AI-ModelScope/mmlu-redux-2.0',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MMLUReduxAdapter(DataAdapter):
+class MMLUReduxAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

@@ -111,75 +102,38 @@ class MMLUReduxAdapter(DataAdapter):
             self.few_shot_num = 0
             logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')

-… (35 lines removed; content not shown in the rendered diff)
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        answer_index = int(input_d['answer'])
-        return self.choices[answer_index]
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        return exact_match(gold=gold, pred=pred)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        error_type = record['error_type']
+        choices = record['choices']
+        target_index_list = [int(record['answer'])]
+        correct_answer = record['correct_answer']
+        if error_type == 'no_correct_answer' and correct_answer:
+            choices[target_index_list[0]] = correct_answer
+        elif error_type == 'wrong_groundtruth' and correct_answer:
+            try:
+                target_index_list = [int(correct_answer)]
+            except ValueError:
+                choice_index = ord(correct_answer) - ord('A')
+                target_index_list = [choice_index]
+        elif error_type == 'multiple_correct_answers' and correct_answer:
+            correct_answer = correct_answer.strip('()')
+            try:
+                correct_answer = correct_answer.replace(' and ', ',').replace(' or ', ',')
+                target_index_list = list(map(int, correct_answer.split(',')))
+            except ValueError:
+                try:
+                    target_index_list = [ord(c) - ord('A') for c in correct_answer.split(',')]
+                except TypeError:
+                    # find the index of the correct answer in choices
+                    target_index_list = [choices.index(c) for c in correct_answer.split(',') if c in choices]
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=['ABCD'[i] for i in target_index_list] if target_index_list else ['A', 'B', 'C', 'D'],
+            metadata={
+                'error_type': error_type,
+                'correct_answer': correct_answer,
+                'potential_reason': record.get('potential_reason', ''),
+            },
+        )
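
The hunks above illustrate the new adapter API: a benchmark is declared by decorating an adapter class with `register_benchmark(BenchmarkMeta(...))`, and per-record parsing collapses into a single `record_to_sample` method returning a `Sample`, replacing the old `get_gold_answer` / `parse_pred_result` / `match` trio. Below is a minimal sketch of a custom benchmark written against this API. The imports, the decorator shape and the `record_to_sample` signature are taken directly from the diff; the benchmark name, dataset id, subsets and record field names are hypothetical, and additional required `BenchmarkMeta` fields may exist that this diff does not show.

```python
# Hypothetical adapter; a sketch of the 1.0.0 registration pattern, not code
# that ships with evalscope. Names marked below are invented for illustration.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq_bench',                 # hypothetical benchmark name
        pretty_name='My-MCQ-Bench',
        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
        description='A toy multiple-choice benchmark illustrating the 1.0.0 adapter API.',
        dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Assumes each record carries 'question', 'choices' (list of strings)
        # and 'answer' (0-based index of the correct choice).
        choices = record['choices']
        target_letter = 'ABCDEFGH'[int(record['answer'])]
        return Sample(
            input=record['question'],
            choices=choices,
            target=target_letter,
            metadata={'source': 'illustrative example'},
        )
```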
evalscope/benchmarks/musr/musr_adapter.py

@@ -1,74 +1,43 @@
 import ast
 from typing import Any

-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
-… (18 lines removed; content not shown in the rendered diff)
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='musr',
+        pretty_name='MuSR',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.',  # noqa: E501
+        dataset_id='AI-ModelScope/MuSR',
+        metric_list=['acc'],
+        subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MuSRAdapter(…
+class MuSRAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.…
-
-    def load(self, **kwargs):
-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-
-        choices = self.format_choice(ast.literal_eval(input_d['choices']))
-
-        full_prompt = self.prompt_template.format(
-            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def format_choice(self, options: list):
-        option_str = ''
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return self.choices[input_d['answer_index']]
+        self.split_as_subset = True

-    def …
-… (3 lines removed; content not shown in the rendered diff)
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
+    def record_to_sample(self, record) -> Sample:
+        choices = ast.literal_eval(record['choices'])
+        choice_letters = ['A', 'B', 'C', 'D', 'E', 'F']
+        target_letter = choice_letters[record['answer_index']]

-… (5 lines removed; content not shown in the rendered diff)
+        return Sample(
+            input=f"{record['narrative']}\n\n{record['question']}",
+            choices=choices,
+            target=target_letter,
+        )