evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/aime/aime24_adapter.py
@@ -0,0 +1,55 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='aime24',
+        pretty_name='AIME-2024',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',  # noqa: E501
+        dataset_id='HuggingFaceH4/aime_2024',
+        subset_list=['default'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',  # Only train set is available
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class AIME24Adapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            metadata={
+                'problem_id': record.get('id', ''),
+                'solution': record.get('solution', ''),
+            },
+        )
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
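For context, a minimal sketch of how a benchmark registered this way is typically invoked through evalscope's Python entry point. This sketch is not part of the diff: the model identifier and the `limit` value are placeholders, and the exact TaskConfig fields accepted by 1.2.0 are assumed from evalscope's documented usage.

# Hedged usage sketch (assumptions noted above): run the newly registered
# 'aime24' benchmark against a placeholder model on a handful of samples.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['aime24'],                 # name registered by @register_benchmark above
    limit=5,                             # assumed option to cap evaluated samples
)
run_task(task_cfg=task_cfg)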
evalscope/benchmarks/aime/aime25_adapter.py
@@ -0,0 +1,181 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import re
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+JUDGE_PROMPT = """
+Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
+
+Examples:
+
+Expression 1: $2x+3$
+Expression 2: $3+2x$
+
+Yes
+
+Expression 1: 3/2
+Expression 2: 1.5
+
+Yes
+
+Expression 1: $x^2+2x+1$
+Expression 2: $y^2+2y+1$
+
+No
+
+Expression 1: $x^2+2x+1$
+Expression 2: $(x+1)^2$
+
+Yes
+
+Expression 1: 3245/5
+Expression 2: 649
+
+No
+(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
+
+Expression 1: 2/(-3)
+Expression 2: -2/3
+
+Yes
+(trivial simplifications are allowed)
+
+Expression 1: 72 degrees
+Expression 2: 72
+
+Yes
+(give benefit of the doubt to units)
+
+Expression 1: 64
+Expression 2: 64 square feet
+
+Yes
+(give benefit of the doubt to units)
+
+---
+
+YOUR TASK
+
+
+Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
+
+Expression 1: {expression1}
+Expression 2: {expression2}
+
+"""
+
+PROMPT_TEMPLATE = """
+Solve the following math problem step by step. Put your answer inside \\boxed{{}}.
+
+{question}
+
+Remember to put your answer inside \\boxed{{}}."""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='aime25',
+        pretty_name='AIME-2025',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
+        dataset_id='opencompass/AIME2025',
+        subset_list=['AIME2025-I', 'AIME2025-II'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class AIME25Adapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """
+        Args:
+            prediction (str): The model prediction to extract from
+            task_state (TaskState): The task state for additional context
+
+        Returns:
+            str: The extracted answer
+        """
+        from evalscope.metrics.math_parser import extract_answer
+        from .math_normalize import normalize_answer
+
+        extracted_pred = extract_answer(prediction)
+        filtered_pred = normalize_answer(extracted_pred)
+        return filtered_pred
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        from evalscope.metrics.math_parser import extract_answer
+        from .grader import grade_answer
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Use the custom grade_answer function for evaluation
+        try:
+            is_correct = grade_answer(extract_answer(original_prediction), reference)
+            accuracy_score = 1.0 if is_correct else 0.0
+            score.value['acc'] = accuracy_score
+        except Exception as e:
+            logger.error(f'Error in custom grading: {e}')
+            score.value['acc'] = 0.0
+            score.metadata['acc'] = f'grading_error: {str(e)}'
+        return score
+
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        judge_prompt = JUDGE_PROMPT.format(expression1=original_prediction, expression2=reference)
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        is_correct = bool(re.search(r'\bYes\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score
evalscope/benchmarks/aime/grader.py
@@ -0,0 +1,307 @@
+"""
+Answer checker API that uses sympy to simplify expressions and check for equality.
+
+Call grade_answer(given_answer: str, ground_truth: str).
+
+This file is adapted from OpenAI's PRM800K repository:
+https://github.com/openai/prm800k/blob/main/prm800k/grading/grader.py
+
+Original License:
+MIT License
+
+Copyright (c) 2023 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+# flake8: noqa
+import re
+import sympy
+from pylatexenc import latex2text
+from sympy.parsing import sympy_parser
+
+from . import math_normalize
+
+# sympy might hang -- we don't care about trying to be lenient in these cases
+BAD_SUBSTRINGS = ['^{', '^(']
+BAD_REGEXES = ['\^[0-9]+\^', '\^[0-9][0-9]+']
+TUPLE_CHARS = '()[]'
+
+
+def _sympy_parse(expr: str):
+    """Parses an expression with sympy."""
+    py_expr = expr.replace('^', '**')
+    return sympy_parser.parse_expr(
+        py_expr,
+        transformations=(sympy_parser.standard_transformations + (sympy_parser.implicit_multiplication_application, )),
+    )
+
+
+def _parse_latex(expr: str) -> str:
+    """Attempts to parse latex to an expression sympy can read."""
+    expr = expr.replace('\\tfrac', '\\frac')
+    expr = expr.replace('\\dfrac', '\\frac')
+    expr = expr.replace('\\frac', ' \\frac')  # Play nice with mixed numbers.
+    expr = latex2text.LatexNodes2Text().latex_to_text(expr)
+
+    # Replace the specific characters that this parser uses.
+    expr = expr.replace('√', 'sqrt')
+    expr = expr.replace('π', 'pi')
+    expr = expr.replace('∞', 'inf')
+    expr = expr.replace('∪', 'U')
+    expr = expr.replace('·', '*')
+    expr = expr.replace('×', '*')
+
+    return expr.strip()
+
+
+def _is_float(num: str) -> bool:
+    try:
+        float(num)
+        return True
+    except ValueError:
+        return False
+
+
+def _is_int(x: float) -> bool:
+    try:
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _is_frac(expr: str) -> bool:
+    return bool(re.search(r'^-?[0-9]+.?/0*[1-9][0-9]*.?$', expr))
+
+
+def _str_is_int(x: str) -> bool:
+    try:
+        x = _strip_properly_formatted_commas(x)
+        x = float(x)
+        return abs(x - int(round(x))) <= 1e-7
+    except:
+        return False
+
+
+def _str_to_int(x: str) -> bool:
+    x = x.replace(',', '')
+    x = float(x)
+    return int(x)
+
+
+def _inject_implicit_mixed_number(step: str):
+    """
+    Automatically make a mixed number evalable
+    e.g. 7 3/4 => 7+3/4
+    """
+    p1 = re.compile('([0-9]) +([0-9])')
+    step = p1.sub('\\1+\\2', step)  ## implicit mults
+    return step
+
+
+def _strip_properly_formatted_commas(expr: str):
+    # We want to be careful because we don't want to strip tuple commas
+    p1 = re.compile('(\d)(,)(\d\d\d)($|\D)')
+    while True:
+        next_expr = p1.sub('\\1\\3\\4', expr)
+        if next_expr == expr:
+            break
+        expr = next_expr
+    return next_expr
+
+
+def _normalize(expr: str) -> str:
+    """Normalize answer expressions."""
+    if expr is None:
+        return None
+
+    # Remove enclosing `\text{}`.
+    m = re.search('^\\\\text\{(?P<text>.+?)\}$', expr)
+    if m is not None:
+        expr = m.group('text')
+
+    expr = expr.replace('\\%', '%')
+    expr = expr.replace('\\$', '$')
+    expr = expr.replace('$', '')
+    expr = expr.replace('%', '')
+    expr = expr.replace(' or ', ' , ')
+    expr = expr.replace(' and ', ' , ')
+
+    expr = expr.replace('million', '*10^6')
+    expr = expr.replace('billion', '*10^9')
+    expr = expr.replace('trillion', '*10^12')
+
+    for unit in [
+        'degree',
+        'cm',
+        'centimeter',
+        'meter',
+        'mile',
+        'second',
+        'minute',
+        'hour',
+        'day',
+        'week',
+        'month',
+        'year',
+        'foot',
+        'feet',
+        'inch',
+        'yard',
+    ]:
+        expr = re.sub(f'{unit}(es)?(s)? *(\^[0-9]+)?', '', expr)
+    expr = re.sub(f'\^ *\\\\circ', '', expr)
+
+    if len(expr) > 0 and expr[0] == '{' and expr[-1] == '}':
+        expr = expr[1:-1]
+
+    expr = re.sub(',\\\\! *', '', expr)
+    if _is_float(expr) and _is_int(float(expr)):
+        expr = str(int(round(float(expr))))
+    if '\\' in expr:
+        try:
+            expr = _parse_latex(expr)
+        except:
+            pass
+
+    # edge case with mixed numbers and negative signs
+    expr = re.sub('- *', '-', expr)
+
+    expr = _inject_implicit_mixed_number(expr)
+    expr = expr.replace(' ', '')
+
+    # if we somehow still have latex braces here, just drop them
+    expr = expr.replace('{', '')
+    expr = expr.replace('}', '')
+
+    # don't be case sensitive for text answers
+    expr = expr.lower()
+
+    if _str_is_int(expr):
+        expr = str(_str_to_int(expr))
+
+    return expr
+
+
+def count_unknown_letters_in_expr(expr: str):
+    expr = expr.replace('sqrt', '')
+    expr = expr.replace('frac', '')
+    letters_in_expr = set([x for x in expr if x.isalpha()])
+    return len(letters_in_expr)
+
+
+def should_allow_eval(expr: str):
+    # we don't want to try parsing unknown text or functions of more than two variables
+    if count_unknown_letters_in_expr(expr) > 2:
+        return False
+
+    for bad_string in BAD_SUBSTRINGS:
+        if bad_string in expr:
+            return False
+
+    for bad_regex in BAD_REGEXES:
+        if re.search(bad_regex, expr) is not None:
+            return False
+
+    return True
+
+
+def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
+    are_equal = False
+    try:
+        expr = f'({ground_truth_normalized})-({given_normalized})'
+        if should_allow_eval(expr):
+            sympy_diff = _sympy_parse(expr)
+            simplified = sympy.simplify(sympy_diff)
+            if simplified == 0:
+                are_equal = True
+    except:
+        pass
+    return are_equal
+
+
+def split_tuple(expr: str):
+    """
+    Split the elements in a tuple/interval, while handling well-formatted commas in large numbers
+    """
+    expr = _strip_properly_formatted_commas(expr)
+    if len(expr) == 0:
+        return []
+    if (
+        len(expr) > 2 and expr[0] in TUPLE_CHARS and expr[-1] in TUPLE_CHARS
+        and all([ch not in expr[1:-1] for ch in TUPLE_CHARS])
+    ):
+        elems = [elem.strip() for elem in expr[1:-1].split(',')]
+    else:
+        elems = [expr]
+    return elems
+
+
+def grade_answer(given_answer: str, ground_truth: str) -> bool:
+    """
+    The answer will be considered correct if:
+    (a) it normalizes to the same string as the ground truth answer
+    OR
+    (b) sympy can simplify the difference between the expressions to 0
+    """
+    if given_answer is None:
+        return False
+
+    ground_truth_normalized_mathd = math_normalize.normalize_answer(ground_truth)
+    given_answer_normalized_mathd = math_normalize.normalize_answer(given_answer)
+
+    # be at least as lenient as mathd
+    if ground_truth_normalized_mathd == given_answer_normalized_mathd:
+        return True
+
+    ground_truth_normalized = _normalize(ground_truth)
+    given_normalized = _normalize(given_answer)
+
+    if ground_truth_normalized is None:
+        return False
+
+    if ground_truth_normalized == given_normalized:
+        return True
+
+    if len(given_normalized) == 0:
+        return False
+
+    ground_truth_elems = split_tuple(ground_truth_normalized)
+    given_elems = split_tuple(given_normalized)
+
+    if len(ground_truth_elems) > 1 and (
+        ground_truth_normalized[0] != given_normalized[0] or ground_truth_normalized[-1] != given_normalized[-1]
+    ):
+        is_correct = False
+    elif len(ground_truth_elems) != len(given_elems):
+        is_correct = False
+    else:
+        for ground_truth_elem, given_elem in zip(ground_truth_elems, given_elems):
+            if _is_frac(ground_truth_elem) and _is_frac(given_elem):
+                # if fractions aren't reduced, then shouldn't be marked as correct
+                # so, we don't want to allow sympy.simplify in this case
+                is_correct = ground_truth_elem == given_elem
+            elif _str_is_int(ground_truth_elem) != _str_is_int(given_elem):
+                # if the ground truth answer is an integer, we require the given answer to be a strict match (no sympy.simplify)
+                is_correct = False
+            else:
+                is_correct = are_equal_under_sympy(ground_truth_elem, given_elem)
+            if not is_correct:
+                break
+
+    return is_correct
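To make the grading rules above concrete, a small illustration of how grade_answer behaves on a few answer pairs; the expected results follow from the normalization and sympy paths in this file, assuming sympy and pylatexenc are installed alongside the sibling math_normalize module.

# Illustrative only; each expectation traces to a branch in grade_answer above.
from evalscope.benchmarks.aime.grader import grade_answer

print(grade_answer('1/2', '0.5'))        # True: sympy simplifies (0.5)-(1/2) to 0
print(grade_answer('2x+3', '3+2x'))      # True: the difference simplifies to 0
print(grade_answer('72 degrees', '72'))  # True: unit words are stripped by _normalize
print(grade_answer('3245/5', '649'))     # False: integer ground truth requires a strict match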