evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/live_code_bench/testing_util.py
@@ -0,0 +1,544 @@
# flake8: noqa
import ast
import faulthandler
import json
import numpy as np
import platform

# to run the solution files we're using a timing based approach
import signal
import sys
import time

# used for debugging to time steps
from datetime import datetime
from decimal import Decimal
from enum import Enum
from functools import partial
from io import StringIO

# from pyext import RuntimeModule
from types import ModuleType

# used for testing the code that reads from input
from unittest.mock import mock_open, patch

from evalscope.utils.logger import get_logger

logger = get_logger()

import_string = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n'


def truncatefn(s, length=300):
    if isinstance(s, str):
        pass
    else:
        s = str(s)
    if len(s) <= length:
        return s

    return s[:length // 2] + '...(truncated) ...' + s[-length // 2:]


class CODE_TYPE(Enum):
    call_based = 0
    standard_input = 1


# stuff for setting up signal timer
class TimeoutException(Exception):
    pass


def timeout_handler(debug, signum, frame):
    if debug:
        logger.info('timeout occured: alarm went off')
    raise TimeoutException


# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative use redirect_stdout() from contextlib
class Capturing(list):

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the StringIO a no-op
        self._stringio.close = lambda x: 1
        return self

    def __exit__(self, *args):
        self.append(self._stringio.getvalue())
        del self._stringio  # free up some memory
        sys.stdout = self._stdout


def clean_if_name(code: str) -> str:
    try:
        astree = ast.parse(code)
        last_block = astree.body[-1]
        if isinstance(last_block, ast.If):
            condition = last_block.test
            if ast.unparse(condition).strip() == "__name__ == '__main__'":
                code = (
                    ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body)  # type: ignore
                )
    except:
        pass

    return code


def make_function(code: str) -> str:
    try:
        import_stmts = []
        all_other_stmts = []
        astree = ast.parse(code)
        for stmt in astree.body:
            if isinstance(stmt, (ast.Import, ast.ImportFrom)):
                import_stmts.append(stmt)
            else:
                all_other_stmts.append(stmt)

        function_ast = ast.FunctionDef(
            name='wrapped_function',
            args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]),
            body=all_other_stmts,
            decorator_list=[],
            lineno=-1,
        )
        main_code = (
            import_string + '\n' + ast.unparse(import_stmts)  # type: ignore
            + '\n' + ast.unparse(function_ast)  # type: ignore
        )
        return main_code
    except Exception as e:
        return code


def call_method(method, inputs):

    if isinstance(inputs, list):
        inputs = '\n'.join(inputs)

    inputs_line_iterator = iter(inputs.split('\n'))

    # sys.setrecursionlimit(10000)

    # @patch('builtins.input', side_effect=inputs.split("\n"))
    @patch('builtins.open', mock_open(read_data=inputs))
    @patch('sys.stdin', StringIO(inputs))
    @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
    @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
    @patch('sys.stdin.read', lambda *args: inputs)
    # @patch('sys.stdout.write', print)
    def _inner_call_method(_method):
        try:
            return _method()
        except SystemExit as e:
            pass
        finally:
            pass

    return _inner_call_method(method)


def get_function(compiled_sol, fn_name: str):  # type: ignore
    try:
        assert hasattr(compiled_sol, fn_name)
        return getattr(compiled_sol, fn_name)
    except Exception as e:
        return


def compile_code(code: str, timeout: int):
    signal.alarm(timeout)
    try:
        tmp_sol = ModuleType('tmp_sol', '')
        exec(code, tmp_sol.__dict__)
        if 'class Solution' in code:
            # leetcode wraps solutions in `Solution`
            # this is a hack to check if it is leetcode solution or not
            # currently livecodebench only supports LeetCode but
            # else condition allows future extensibility to other platforms
            compiled_sol = tmp_sol.Solution()
        else:
            # do nothing in the other case since function is accesible
            compiled_sol = tmp_sol

        assert compiled_sol is not None
    finally:
        signal.alarm(0)

    return compiled_sol


def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
    try:
        decimal_line = [Decimal(elem) for elem in line.split()]
    except:
        return False, []
    return True, decimal_line


def get_stripped_lines(val: str):
    ## you don't want empty lines to add empty list after splitlines!
    val = val.strip()

    return [val_line.strip() for val_line in val.split('\n')]


def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int):
    # call-based clean up logic
    # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
    code = import_string + '\n\n' + code
    compiled_sol = compile_code(code, timeout)

    if compiled_sol is None:
        return

    method = get_function(compiled_sol, fn_name)

    if method is None:
        return

    all_inputs = [[json.loads(line) for line in inputs.split('\n')] for inputs in all_inputs]

    all_outputs = [json.loads(output) for output in all_outputs]

    total_execution = 0
    all_results = []
    for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
        signal.alarm(timeout)
        # faulthandler.enable()
        try:
            # can lock here so time is useful
            start = time.time()
            prediction = method(*gt_inp)
            total_execution += time.time() - start
            signal.alarm(0)

            # don't penalize model if it produces tuples instead of lists
            # ground truth sequences are not tuples
            if isinstance(prediction, tuple):
                prediction = list(prediction)

            tmp_result = prediction == gt_out

            # handle floating point comparisons

            all_results.append(tmp_result)

            if not tmp_result:
                return all_results, {
                    'output': truncatefn(prediction),
                    'inputs': truncatefn(gt_inp),
                    'expected': truncatefn(gt_out),
                    'error_code': -2,
                    'error_message': 'Wrong Answer',
                }
        except Exception as e:
            signal.alarm(0)
            if 'timeoutexception' in repr(e).lower():
                all_results.append(-3)
                return all_results, {
                    'error': repr(e),
                    'error_code': -3,
                    'error_message': 'Time Limit Exceeded',
                    'inputs': truncatefn(gt_inp),
                    'expected': truncatefn(gt_out),
                }
            else:
                all_results.append(-4)
                return all_results, {
                    'error': repr(e),
                    'error_code': -4,
                    'error_message': 'Runtime Error',
                    'inputs': truncatefn(gt_inp),
                    'expected': truncatefn(gt_out),
                }

        finally:
            signal.alarm(0)
            # faulthandler.disable()

    return all_results, {'execution time': total_execution}


def grade_stdio(
    code: str,
    all_inputs: list,
    all_outputs: list,
    timeout: int,
):
    ## runtime doesn't interact well with __name__ == '__main__'
    code = clean_if_name(code)

    ## we wrap the given code inside another function
    code = make_function(code)

    compiled_sol = compile_code(code, timeout)
    if compiled_sol is None:
        return

    method = get_function(compiled_sol, 'wrapped_function')

    if method is None:
        return

    all_results = []
    total_execution_time = 0
    for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
        signal.alarm(timeout)
        # faulthandler.enable()

        with Capturing() as captured_output:
            try:
                start = time.time()
                call_method(method, gt_inp)
                total_execution_time += time.time() - start
                # reset the alarm
                signal.alarm(0)
            except Exception as e:
                signal.alarm(0)
                if 'timeoutexception' in repr(e).lower():
                    all_results.append(-3)
                    return all_results, {
                        'error': repr(e),
                        'error_code': -3,
                        'error_message': 'Time Limit Exceeded',
                        'inputs': truncatefn(gt_inp),
                        'expected': truncatefn(gt_out),
                    }
                else:
                    all_results.append(-4)
                    return all_results, {
                        'error': repr(e),
                        'error_code': -4,
                        'error_message': 'Runtime Error',
                        'inputs': truncatefn(gt_inp),
                        'expected': truncatefn(gt_out),
                    }

            finally:
                signal.alarm(0)
                # faulthandler.disable()

        prediction = captured_output[0]

        stripped_prediction_lines = get_stripped_lines(prediction)
        stripped_gt_out_lines = get_stripped_lines(gt_out)

        ## WA happens in multiple circumstances
        ## so cache the return to make it clean!
        WA_send_args = {
            'output': truncatefn(prediction),
            'inputs': truncatefn(gt_inp),
            'expected': truncatefn(gt_out),
            'error_code': -2,
        }

        if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
            all_results.append(-2)
            WA_send_args['error_message'] = 'Wrong answer: mismatched output length'
            return all_results, WA_send_args

        for output_line_idx, (
            stripped_prediction_line,
            stripped_gt_out_line,
        ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
            WA_send_args['error_message'] = (
                f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
            )

            ## CASE 1: exact match
            if stripped_prediction_line == stripped_gt_out_line:
                continue

            ## CASE 2: element-wise comparision
            ## if there are floating elements
            ## use `decimal` library for good floating point comparision
            ## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
            ## note that we should always be able to convert to decimals

            success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line)
            if not success:
                all_results.append(-2)
                return all_results, WA_send_args
            success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
            if not success:
                all_results.append(-2)
                return all_results, WA_send_args

            if decimal_prediction_line == decimal_gtout_line:
                continue

            all_results.append(-2)
            return all_results, WA_send_args
        all_results.append(True)

    return all_results, {'execution time': total_execution_time}


def run_test(sample, test=None, debug=False, timeout=6):
    """
    if test(generated_code) is not None it'll try to run the code.
    otherwise it'll just return an input and output pair.
    """
    timeout_handler_wrapper = partial(timeout_handler, debug)
    signal.signal(signal.SIGALRM, timeout_handler_wrapper)

    # Disable functionalities that can make destructive changes to the test.
    # max memory is set to 4GB
    reliability_guard()

    if debug:
        logger.info(f'start = {datetime.now().time()}')

    try:
        in_outs = json.loads(sample['input_output'])
    except ValueError as e:
        raise e
        in_outs = None

    if in_outs:
        if in_outs.get('fn_name') is None:
            which_type = CODE_TYPE.standard_input  # Standard input
            method_name = None

        else:
            which_type = CODE_TYPE.call_based  # Call-based
            method_name = in_outs['fn_name']

    if debug:
        logger.info(f'loaded input_output = {datetime.now().time()}')

    if test is None:
        assert False, 'should not happen: test code is none'
        return in_outs, {'error': 'no test code provided'}
    elif test is not None:
        results = []
        sol = import_string
        if debug:
            logger.info(f'loading test code = {datetime.now().time()}')

        if which_type == CODE_TYPE.call_based:
            signal.alarm(timeout)
            try:
                results, metadata = grade_call_based(
                    code=test,
                    all_inputs=in_outs['inputs'],
                    all_outputs=in_outs['outputs'],
                    fn_name=method_name,
                    timeout=timeout,
                )
                return results, metadata
            except Exception as e:
                return [-4], {
                    'error_code': -4,
                    'error_message': f'Error during testing: {e}',
                }
            finally:
                signal.alarm(0)
        elif which_type == CODE_TYPE.standard_input:
            # sol
            # if code has if __name__ == "__main__": then remove it

            signal.alarm(timeout)
            try:
                results, metadata = grade_stdio(
                    code=test,
                    all_inputs=in_outs['inputs'],
                    all_outputs=in_outs['outputs'],
                    timeout=timeout,
                )
                return results, metadata
            except Exception as e:
                return [-4], {
                    'error_code': -4,
                    'error_message': f'Error during testing: {e}',
                }
            finally:
                signal.alarm(0)


def reliability_guard(maximum_memory_bytes=None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """

    if maximum_memory_bytes is not None:
        import resource

        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    # faulthandler.disable()

    import builtins

    # builtins.exit = None
    builtins.quit = None

    import os

    os.environ['OMP_NUM_THREADS'] = '1'

    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    __builtins__['help'] = None

    import sys

    sys.modules['ipdb'] = None
    sys.modules['joblib'] = None
    sys.modules['resource'] = None
    sys.modules['psutil'] = None
    sys.modules['tkinter'] = None
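A minimal usage sketch for the grader above (illustrative only, not part of the package diff). run_test() expects a sample dict whose 'input_output' field is a JSON string with 'inputs', 'outputs' and, for call-based problems, 'fn_name'; the toy problem and candidate code below are made up. Timeouts use SIGALRM, so this is Unix-only, and reliability_guard() disables parts of os/shutil in the calling interpreter, so callers would typically isolate each run in its own process. The module path is taken from the file list above.

import json

from evalscope.benchmarks.live_code_bench.testing_util import run_test

sample = {
    'input_output': json.dumps({
        'inputs': ['1 2\n'],   # stdin fed to the wrapped program
        'outputs': ['3\n'],    # expected stdout, compared line by line
        # no 'fn_name' key -> the standard_input (stdio) branch is taken
    })
}
candidate = 'a, b = map(int, input().split())\nprint(a + b)'

results, metadata = run_test(sample, test=candidate, timeout=6)
# results -> [True]; failing tests use the codes defined above:
# -2 wrong answer, -3 time limit exceeded, -4 runtime error.
# metadata carries timing or error details for reporting.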
evalscope/benchmarks/logi_qa/logi_qa_adapter.py
@@ -0,0 +1,41 @@
# flake8: noqa: E501

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags

DESCRIPTION = 'LogiQA is a dataset sourced from expert-written questions for testing human Logical reasoning.'

PROMPT_TEMPLATE = r"""
Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.

{question}

{choices}
""".strip()


@register_benchmark(
    BenchmarkMeta(
        name='logi_qa',
        pretty_name='LogiQA',
        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
        description=DESCRIPTION.strip(),
        dataset_id='extraordinarylab/logiqa',
        metric_list=['acc'],
        few_shot_num=0,
        train_split='validation',
        eval_split='test',
        prompt_template=PROMPT_TEMPLATE,
    )
)
class LogiQAAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        return Sample(
            input=f"{record['context']}\n{record['question']}",
            choices=record['choices'],
            target=record['answer'],
            metadata={},
        )
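Once registered via @register_benchmark, the dataset becomes selectable by name through evalscope's task entry points (evalscope/config.py and evalscope/run.py in the listing above). A hedged sketch, assuming the TaskConfig/run_task API exported from the package root; the model identifier and keyword names are illustrative and may differ in 1.2.0.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # hypothetical model identifier
    datasets=['logi_qa'],              # name declared in the BenchmarkMeta above
    limit=10,                          # evaluate a small slice while smoke-testing
)
run_task(task)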
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -0,0 +1,56 @@
from typing import Any

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags

MARITIME_PROMPT_TEMPLATE = '请回答单选题。要求只输出选项,不输出解释,将选项放在[]里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:[A]\n 当前题目\n {question}\n选项:\n{choices}'  # noqa: E501


@register_benchmark(
    BenchmarkMeta(
        name='maritime_bench',
        pretty_name='MaritimeBench',
        tags=[Tags.CHINESE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
        description=
        'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
        dataset_id='HiDolphin/MaritimeBench',
        metric_list=['acc'],
        few_shot_num=0,
        eval_split='test',
        prompt_template=MARITIME_PROMPT_TEMPLATE,
    )
)
class MaritimeBenchAdapter(MultiChoiceAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.reformat_subset = True

    def record_to_sample(self, record) -> Sample:
        # Extract available choices from the record
        choices = []
        choice_letters = ['A', 'B', 'C', 'D']
        for letter in choice_letters:
            if letter in record and record[letter]:
                choices.append(record[letter])

        return Sample(
            input=record['question'],
            choices=choices,
            target=record['answer'],
        )

    def format_prompt_template(self, sample):
        choices = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(sample.choices)])
        return MARITIME_PROMPT_TEMPLATE.format(question=sample.input, choices=choices)

    def extract_answer(self, prediction, task_state):
        # use regex to extract the answer from the prediction
        import re
        match = re.search(r'\[([A-D])\]', prediction)
        if match:
            return match.group(1)
        return ''
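The two helpers above are small enough to trace by hand; the strings below are made-up examples (not from the dataset) showing what format_prompt_template() substitutes for {choices} and what extract_answer() pulls back out of a completion.

import re

# Choice lettering, as done inside format_prompt_template():
choices = ['电磁力', '压拉应力', '弯曲应力', '扭应力']
lettered = '\n'.join(f'{chr(65 + i)}. {c}' for i, c in enumerate(choices))
# -> 'A. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力'

# Answer extraction, as done inside extract_answer():
prediction = '答:[B]'
match = re.search(r'\[([A-D])\]', prediction)
answer = match.group(1) if match else ''  # -> 'B'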
evalscope/benchmarks/math_500/math_500_adapter.py
@@ -0,0 +1,55 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.logger import get_logger

logger = get_logger()


@register_benchmark(
    BenchmarkMeta(
        name='math_500',
        pretty_name='MATH-500',
        tags=[Tags.MATH, Tags.REASONING],
        description=
        "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.",  # noqa: E501
        dataset_id='AI-ModelScope/MATH-500',
        subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
        metric_list=[{
            'acc': {
                'numeric': True
            }
        }],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
    )
)
class Math500Adapter(DefaultDataAdapter):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.reformat_subset = True

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        return Sample(
            input=record['problem'],
            target=record['answer'],
            subset_key=f"Level {record['level']}",
            metadata={
                'question_id': record['unique_id'],
                'solution': record['solution'],
            },
        )

    def extract_answer(self, prediction: str, task_state):
        from evalscope.metrics.math_parser import extract_answer

        return extract_answer(prediction)
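Answer extraction is delegated to evalscope.metrics.math_parser, also added in this release (+545 lines in the listing above). A hedged sketch of the expected behaviour on a step-by-step completion, assuming extract_answer() returns the final boxed value; the completion text is invented.

from evalscope.metrics.math_parser import extract_answer

completion = (
    'The legs are 3 and 4, so the hypotenuse is \\sqrt{9 + 16} = 5. '
    'The final answer is \\boxed{5}.'
)
print(extract_answer(completion))  # expected: '5'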