evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/math_500/math_500_adapter.py
@@ -1,58 +1,51 @@
-from
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
 
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-@
+@register_benchmark(
+    BenchmarkMeta(
+        name='math_500',
+        pretty_name='MATH-500',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.", # noqa: E501
+        dataset_id='AI-ModelScope/MATH-500',
+        subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
 )
-class Math500Adapter(
+class Math500Adapter(DefaultDataAdapter):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return strip_answer_string(input_d['answer'])
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        # Note: Use same extraction method for both of checkpoint/service/custom
-        result = strip_answer_string(extract_answer(result))
-        return result
-
-    def match(self, gold: str, pred: str) -> float:
-        res = math_equal(pred, gold)
-        return 1.0 if res else 0.0
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            subset_key=f"Level {record['level']}",
+            metadata={
+                'question_id': record['unique_id'],
+                'solution': record['solution'],
+            },
+        )
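The hunk above illustrates the 1.0.0 adapter pattern: benchmark metadata moves into a declarative `BenchmarkMeta` passed to `@register_benchmark`, and the old `gen_prompt` / `get_gold_answer` / `parse_pred_result` / `match` hooks collapse into a single `record_to_sample` mapping, with prompting and scoring handled by the shared `DefaultDataAdapter` machinery. A minimal sketch of a custom benchmark written against those names as they appear in this diff; the benchmark name, dataset id, and record fields are invented for illustration, and the exact `BenchmarkMeta` signature in 1.0.0 may differ:

```python
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',                       # hypothetical benchmark name
        dataset_id='my-org/my-qa-dataset',  # hypothetical dataset id
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}\nAnswer concisely.',
    )
)
class MyQAAdapter(DefaultDataAdapter):
    """Maps one raw dataset record to an evalscope Sample."""

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        return Sample(
            input=record['question'],   # assumed raw field name
            target=record['answer'],    # assumed raw field name
            metadata={'id': record.get('id')},
        )
```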
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -1,77 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
-import os
 
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-# flake8: noqa
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 logger = get_logger()
 
-SUBSET_LIST = [
-    'high_school_european_history',
-    'business_ethics',
-    'clinical_knowledge',
-    'medical_genetics',
-    'high_school_us_history',
-    'high_school_physics',
-    'high_school_world_history',
-    'virology',
-    'high_school_microeconomics',
-    'econometrics',
-    'college_computer_science',
-    'high_school_biology',
-    'abstract_algebra',
-    'professional_accounting',
-    'philosophy',
-    'professional_medicine',
-    'nutrition',
-    'global_facts',
-    'machine_learning',
-    'security_studies',
-    'public_relations',
-    'professional_psychology',
-    'prehistory',
-    'anatomy',
-    'human_sexuality',
-    'college_medicine',
-    'high_school_government_and_politics',
-    'college_chemistry',
-    'logical_fallacies',
-    'high_school_geography',
-    'elementary_mathematics',
-    'human_aging',
-    'college_mathematics',
-    'high_school_psychology',
-    'formal_logic',
-    'high_school_statistics',
-    'international_law',
-    'high_school_mathematics',
-    'high_school_computer_science',
-    'conceptual_physics',
-    'miscellaneous',
-    'high_school_chemistry',
-    'marketing',
-    'professional_law',
-    'management',
-    'college_physics',
-    'jurisprudence',
-    'world_religions',
-    'sociology',
-    'us_foreign_policy',
-    'high_school_macroeconomics',
-    'computer_security',
-    'moral_scenarios',
-    'moral_disputes',
-    'electrical_engineering',
-    'astronomy',
-    'college_biology',
-]
-
 SUBJECT_MAPPING = {
     'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],
@@ -133,148 +70,38 @@ SUBJECT_MAPPING = {
 }
 
 
-@
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu',
+        pretty_name='MMLU',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.", # noqa: E501
+        dataset_id='cais/mmlu',
+        metric_list=['acc'],
+        subset_list=list(SUBJECT_MAPPING.keys()),
+        default_subset='all',
+        few_shot_num=5,
+        train_split='dev',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MMLUAdapter(
+class MMLUAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
 
-        few_shot_num = kwargs.get('few_shot_num', 5)
-        if few_shot_num > 5:
-            logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
-            kwargs['few_shot_num'] = 5
-
         super().__init__(**kwargs)
 
+        self.reformat_subset = True
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-        self.choices = ['A', 'B', 'C', 'D']
-
-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-
-            for split_name in [self.train_split, self.eval_split]:
-                if split_name == 'train':
-                    split_name_suffix = 'dev'
-                elif split_name == 'test':
-                    split_name_suffix = 'test'
-                elif split_name == 'validation':
-                    split_name_suffix = 'val'
-                else:
-                    raise ValueError(f'Invalid split name: {split_name}')
-
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-
-                if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        for row in reader:
-                            if len(row) != 6:
-                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                                continue
-                            rows.append({
-                                'input': row[0],
-                                'A': row[1],
-                                'B': row[2],
-                                'C': row[3],
-                                'D': row[4],
-                                'target': row[5],
-                            })
-
-                        data_dict[subset_name].update({split_name: rows})
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the MMLU:
-
-            {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.',
-            'A': 'Service quality.',
-            'B': 'Service action.',
-            'C': 'Service recovery.',
-            'D': 'Service satisfaction.',
-            'target': 'A'}
-
-        Returns:
-            {'data': [full_prompt], 'multi_choices': self.choices}
-
-        """
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-
-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('target', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or 'custom'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-        example: str = input_d['input']
-        for j in range(len(self.choices)):
-            example += f'\n{self.choices[j]}) {input_choices[j]}'
-
-        if include_answer:
-            example += f"\nAnswer: {input_d['target']}\n\n"
-        else:
-            example += '\nAnswer: \n\n'
-
-        return example
 
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            # converts 0 -> A, 1 -> B, etc.
+            target=('ABCD'[record['answer']]),
+            subset_key=record['subject'],
+            metadata={'subject': record['subject']},
+        )
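The file list above modifies rather than removes the top-level entry points (`evalscope/run.py`, `evalscope/config.py`, `evalscope/__init__.py`), so the migrated MMLU adapter should still be reachable by its registered dataset name from a normal task run. A rough usage sketch, assuming the `TaskConfig` / `run_task` interface from 0.x is still exported in 1.0.0; the model id and `limit` value are placeholders:

```python
from evalscope import TaskConfig, run_task  # assumed to remain the 1.0.0 entry points

# Smoke-test the migrated MMLU adapter on a handful of samples.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['mmlu'],                   # name registered via @register_benchmark above
    limit=16,                            # evaluate only a few samples per subset
)
run_task(task_cfg=task_cfg)
```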
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
@@ -1,10 +1,30 @@
-from collections import defaultdict
 from typing import Any, Dict
 
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Based on the prompt provided here:
+# https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/mmlu_pro
+SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE = """
+The following are multiple choice questions (with answers) about {subject}. Think step by step and then finish your answer with 'ANSWER: $LETTER' (without quotes) where LETTER is the correct letter choice.
+
+{examples}
+""".lstrip() # noqa: E501
+
+# Based on MultipleChoiceTemplate.SINGLE_ANSWER provided in the multiple choice solver:
+# https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/solver/_multiple_choice.py
+USER_PROMPT_TEMPLATE = """Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+Question:
+{question}
+Options:
+{choices}
+""".lstrip() # noqa: E501
 
 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -12,102 +32,63 @@ SUBSET_LIST = [
 ]
 
 
-@
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_pro',
+        pretty_name='MMLU-Pro',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
+        dataset_id='modelscope/MMLU-Pro',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        few_shot_num=5,
+        train_split='validation',
+        eval_split='test',
+        prompt_template=USER_PROMPT_TEMPLATE,
+        few_shot_prompt_template=SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE + USER_PROMPT_TEMPLATE,
+    )
 )
-class MMLUProAdapter(
+class MMLUProAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.
-    def
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        return exact_match(gold=gold, pred=pred)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['options'],
+            target=record['answer'],
+            subset_key=record['category'].lower(),
+            metadata={
+                'cot_content': record['cot_content'],
+                'subject': record['category'].lower(),
+                'question_id': record['question_id'],
+            },
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        q_str = f"""Question:\n{str(sample.input)}"""
+        options = sample.choices if sample.choices is not None else []
+        opt_str_list = []
+        for i, opt in enumerate(options):
+            opt_str_list.append(f"""{chr(65 + i)} {opt}""")
+        opt_str = '\n'.join(opt_str_list)
+        opt_str = f"""Options:\n{opt_str}"""
+        ans_str = sample.metadata['cot_content'] if sample.metadata is not None else ''
+        ans_str = ans_str.replace('The answer is', 'ANSWER:')
+        ans_opt = ans_str.split('ANSWER:')[-1].split('.')[0].strip().strip('(').strip(')')
+        ans_str = ans_str.replace(f'ANSWER: ({ans_opt})', f'ANSWER: {ans_opt}')
+        final_str = '\n'.join([q_str, opt_str, ans_str])
+
+        return final_str
+
+    def format_fewshot_template(self, fewshot, sample):
+        fewshot_str = SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE.format(
+            subject=sample.metadata['subject'],
+            examples=fewshot,
+        )
+        prompt_str = self.format_prompt_template(sample)
+        return fewshot_str + '\n' + prompt_str
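The few-shot path in the new MMLU-Pro adapter is plain string assembly: `sample_to_fewshot` renders each few-shot example as a question, lettered options, and a chain-of-thought answer rewritten to end in `ANSWER: X`, and `format_fewshot_template` wraps those examples in `SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE` before appending the per-question `USER_PROMPT_TEMPLATE`. A standalone sketch of that assembly, independent of evalscope; the templates are condensed from the diff above and the records are made up, so it only shows the resulting prompt shape:

```python
# Templates condensed from the diff above; the example records are invented for illustration.
SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE = (
    "The following are multiple choice questions (with answers) about {subject}. "
    "Think step by step and then finish your answer with 'ANSWER: $LETTER' "
    "(without quotes) where LETTER is the correct letter choice.\n\n{examples}\n"
)
USER_PROMPT_TEMPLATE = (
    "Answer the following multiple choice question. The last line of your response "
    "should be of the following format: 'ANSWER: $LETTER' (without quotes) where "
    "LETTER is one of {letters}. Think step by step before answering.\n\n"
    "Question:\n{question}\nOptions:\n{choices}\n"
)


def render_fewshot_example(question: str, options: list, cot: str) -> str:
    """Mimics MMLUProAdapter.sample_to_fewshot for a single record."""
    opt_str = '\n'.join(f'{chr(65 + i)} {opt}' for i, opt in enumerate(options))
    ans_str = cot.replace('The answer is', 'ANSWER:')
    ans_opt = ans_str.split('ANSWER:')[-1].split('.')[0].strip().strip('(').strip(')')
    ans_str = ans_str.replace(f'ANSWER: ({ans_opt})', f'ANSWER: {ans_opt}')
    return '\n'.join([f'Question:\n{question}', f'Options:\n{opt_str}', ans_str])


# One invented few-shot example plus the actual question being asked.
example = render_fewshot_example(
    question='What is 2 + 2?',
    options=['3', '4', '5', '22'],
    cot='We add the numbers. The answer is (B).',
)
prompt = SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE.format(subject='math', examples=example)
prompt += '\n' + USER_PROMPT_TEMPLATE.format(
    letters='A,B,C,D',
    question='What is 3 * 3?',
    choices='A 6\nB 9\nC 12\nD 33',
)
print(prompt)
```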