evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -1,29 +1,14 @@
-from collections import defaultdict
 from typing import Any, Dict
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 logger = get_logger()
 
-SUBSET_LIST = [
-    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
-    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
-    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
-    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
-    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
-    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
-    'world_religions'
-]
-
 SUBJECT_MAPPING = {
     'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],
@@ -84,25 +69,31 @@ SUBJECT_MAPPING = {
     'world_religions': ['World Religions', 'philosophy', 'Humanities'],
 }
 
-
-@Benchmark.register(
-    name='mmlu_redux',
-    pretty_name='MMLU-Redux',
-    tags=['MCQ', 'Knowledge'],
-    description=
-    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
-    dataset_id='AI-ModelScope/mmlu-redux-2.0',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=SUBSET_LIST,
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=
-    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+SUBSET_LIST = list(SUBJECT_MAPPING.keys())
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_redux',
+        pretty_name='MMLU-Redux',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options. ' # noqa: E501
+        'The bad answers are corrected.', # noqa: E501
+        dataset_id='AI-ModelScope/mmlu-redux-2.0',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MMLUReduxAdapter(DataAdapter):
+class MMLUReduxAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -111,75 +102,38 @@ class MMLUReduxAdapter(DataAdapter):
             self.few_shot_num = 0
             logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
 
-        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-
-    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        if self.few_shot_num > 0:
-            prefix = self.format_fewshot_examples(few_shot_list)
-        else:
-            prefix = ''
-        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
-            self.__form_options(input_d['choices']) + '\n'
-
-        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-        return self.gen_prompt_data(full_prompt)
-
-    def format_fewshot_examples(self, few_shot_list):
-        # load few-shot prompts for each category
-        prompts = ''
-        for index, d in enumerate(few_shot_list):
-            prompts += 'Q: ' + d['question'] + '\n' + \
-                self.__form_options(d['choices']) + '\n'
-        return prompts
-
-    def __form_options(self, options: list):
-        option_str = 'Options are:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        answer_index = int(input_d['answer'])
-        return self.choices[answer_index]
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        return exact_match(gold=gold, pred=pred)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        error_type = record['error_type']
+        choices = record['choices']
+        target_index_list = [int(record['answer'])]
+        correct_answer = record['correct_answer']
+        if error_type == 'no_correct_answer' and correct_answer:
+            choices[target_index_list[0]] = correct_answer
+        elif error_type == 'wrong_groundtruth' and correct_answer:
+            try:
+                target_index_list = [int(correct_answer)]
+            except ValueError:
+                choice_index = ord(correct_answer) - ord('A')
+                target_index_list = [choice_index]
+        elif error_type == 'multiple_correct_answers' and correct_answer:
+            correct_answer = correct_answer.strip('()')
+            try:
+                correct_answer = correct_answer.replace(' and ', ',').replace(' or ', ',')
+                target_index_list = list(map(int, correct_answer.split(',')))
+            except ValueError:
+                try:
+                    target_index_list = [ord(c) - ord('A') for c in correct_answer.split(',')]
+                except TypeError:
+                    # find the index of the correct answer in choices
+                    target_index_list = [choices.index(c) for c in correct_answer.split(',') if c in choices]
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=['ABCD'[i] for i in target_index_list] if target_index_list else ['A', 'B', 'C', 'D'],
+            metadata={
+                'error_type': error_type,
+                'correct_answer': correct_answer,
+                'potential_reason': record.get('potential_reason', ''),
+            },
+        )
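
Note: in the new API the removed gen_prompt/get_gold_answer/parse_pred_result/match methods collapse into the single record_to_sample hook above. The following standalone sketch (not part of the package) mirrors its answer-correction logic; the record fields ('answer', 'error_type', 'correct_answer') follow the mmlu-redux-2.0 schema shown in the hunk, and the real adapter has an extra fallback for malformed multi-answer labels.

def resolve_target_indices(record: dict) -> list:
    """Return the corrected list of gold choice indices for one record."""
    target = [int(record['answer'])]
    error_type = record.get('error_type')
    correct = record.get('correct_answer')
    if error_type == 'wrong_groundtruth' and correct:
        # The corrected label may be a numeric index ('2') or a letter ('C').
        try:
            target = [int(correct)]
        except ValueError:
            target = [ord(correct) - ord('A')]
    elif error_type == 'multiple_correct_answers' and correct:
        correct = correct.strip('()').replace(' and ', ',').replace(' or ', ',')
        try:
            target = [int(x) for x in correct.split(',')]
        except ValueError:
            target = [ord(x.strip()) - ord('A') for x in correct.split(',')]
    return target


# Example: a record flagged as 'wrong_groundtruth' whose corrected answer is the letter 'C'.
print(resolve_target_indices({'answer': '0', 'error_type': 'wrong_groundtruth', 'correct_answer': 'C'}))  # [2]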
evalscope/benchmarks/musr/musr_adapter.py
@@ -1,74 +1,43 @@
 import ast
 from typing import Any
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-
-
-@Benchmark.register(
-    name='musr',
-    pretty_name='MuSR',
-    tags=['Reasoning', 'MCQ'],
-    description=
-    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
-    dataset_id='AI-ModelScope/MuSR',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=
-    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.', # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='musr',
+        pretty_name='MuSR',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
+        dataset_id='AI-ModelScope/MuSR',
+        metric_list=['acc'],
+        subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MuSRAdapter(DataAdapter):
+class MuSRAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
-
-    def load(self, **kwargs):
-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-
-        choices = self.format_choice(ast.literal_eval(input_d['choices']))
-
-        full_prompt = self.prompt_template.format(
-            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def format_choice(self, options: list):
-        option_str = ''
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return self.choices[input_d['answer_index']]
+        self.split_as_subset = True
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
+    def record_to_sample(self, record) -> Sample:
+        choices = ast.literal_eval(record['choices'])
+        choice_letters = ['A', 'B', 'C', 'D', 'E', 'F']
+        target_letter = choice_letters[record['answer_index']]
 
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return exact_match(gold=gold, pred=pred)
+        return Sample(
+            input=f"{record['narrative']}\n\n{record['question']}",
+            choices=choices,
+            target=target_letter,
+        )
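
Taken together, the two adapter diffs show the 1.0.0 registration pattern: a @register_benchmark decorator wrapping a BenchmarkMeta, an adapter subclassing MultiChoiceAdapter, and a record_to_sample method returning a Sample. Below is a hedged sketch of a custom benchmark under this API, inferred only from the hunks above; the benchmark name, dataset id, and record field names are hypothetical, while the imports and BenchmarkMeta parameters are the ones visible in this diff.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',  # hypothetical benchmark name
        pretty_name='My-MCQ',
        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
        description='A custom multiple-choice benchmark.',
        dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw dataset record to a Sample; the field names here are assumptions.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],  # e.g. 'A'
        )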