evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/data_adapter.py
@@ -1,291 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-import random
-from abc import ABC, abstractmethod
-from typing import Any, List, Optional
-
-from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics import Metric
-from evalscope.report import Report, ReportGenerator
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class DataAdapter(ABC):
-
-    def __init__(self,
-                 name: str,
-                 subset_list: list,
-                 metric_list: List[Metric],
-                 few_shot_num: Optional[int] = 0,
-                 train_split: Optional[str] = None,
-                 eval_split: Optional[str] = None,
-                 prompt_template: Optional[str] = None,
-                 **kwargs):
-        """
-        Data Adapter for the benchmark. You need to implement the following methods:
-            - gen_prompt
-            - get_gold_answer
-            - parse_pred_result
-            - match
-        Args:
-            name: str, the name of the benchmark.
-            subset_list: list of subset names for the dataset.
-            metric_list: list, the metric list to evaluate the model on specific benchmark.
-            few_shot_num: int, number of few-shot examples. Default: 0
-            train_split: str, usually for few-shot examples. e.g. 'train'
-            eval_split: str, the target eval split name. e.g. 'test'
-            prompt_template: str, the prompt template for the benchmark,
-                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
-                the form of A or B or C or D, do not output explanation:`
-        """
-        self.name = name
-        self.subset_list = subset_list
-        self.metric_list = metric_list
-        self.few_shot_num = few_shot_num
-        self.train_split = train_split
-        self.eval_split = eval_split
-        self.prompt_template = prompt_template
-        self.config_kwargs = kwargs
-        self.category_map = kwargs.get('category_map', {})
-
-    def load(self,
-             dataset_name_or_path: str,
-             subset_list: list = None,
-             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
-             **kwargs) -> dict:
-        """
-        Load the dataset. Remote and local datasets are supported.
-        You can rewrite this method to support your own local dataset, just follow the format of the output.
-
-        Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
-            train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.
-
-        """
-        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        subset_list = subset_list or self.subset_list
-
-        # Try to load dataset from local disk
-        if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                subsets: {subset_list}')
-            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
-        else:
-            from modelscope.msdatasets import MsDataset
-
-            # Load dataset from remote
-            logger.info(
-                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
-            data_dict = {}
-            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-            if len(split_list) == 0:
-                logger.error(f'Got empty split list: {split_list}')
-
-            for sub_name in subset_list:
-                data_dict[sub_name] = {}
-                # e.g. train: few-shot, test: target dataset to evaluate
-                for split in split_list:
-                    dataset = MsDataset.load(
-                        dataset_name=dataset_name_or_path,
-                        subset_name=sub_name,
-                        split=split,
-                        cache_dir=work_dir,
-                        hub=datasets_hub,
-                        **kwargs)
-
-                    data_dict[sub_name].update({split: dataset})
-
-        return data_dict
-
-    def load_from_disk(self, *args, **kwargs) -> dict:
-        """
-        Load the dataset from local disk.
-        If you want to support local dataset, please rewrite this method in xxx_data_adapter.
-        """
-        return {}
-
-    def gen_prompts(self, data_dict: dict) -> dict:
-        """
-        Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-        Args:
-            data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
-        Returns:
-            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-            prompt_d_i (dict): refer to the output of gen_prompt method.
-
-            e.g. train -- few-shot data, test -- target dataset to evaluate.
-        """
-        res_dict: dict = {}
-
-        if self.few_shot_num and self.few_shot_num < 0:
-            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
-        logger.info(f'Use default settings: '
-                    f'> few_shot_num: {self.few_shot_num}, '
-                    f'> few_shot_split: {self.train_split}, '
-                    f'> target_eval_split: {self.eval_split}')
-
-        for sub_name, sub_data_dict in data_dict.items():
-            few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
-                few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
-                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
-                                                          self.few_shot_num,
-                                                          few_shot_random=few_shot_random)
-
-            res_dict[sub_name] = []
-            for sample_d in sub_data_dict[self.eval_split]:
-                prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
-                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
-                res_dict[sub_name].append(prompt_d)
-
-        return res_dict
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
-
-    def compute_metric(self, review_res_list: list) -> List[dict]:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Returns:
-            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
-        """
-        if len(self.metric_list) == 0:
-            raise ValueError('No metric list found for the benchmark.')
-
-        res_list = []
-        for metric in self.metric_list:
-            metric_name = metric.name
-            metric_func = metric.object
-            res_list.append({
-                'metric_name': metric_name,
-                'score': metric_func(review_res_list),
-                'num': len(review_res_list)
-            })
-        return res_list
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report.
-
-        Here is a format example for gsm8k:
-        {
-            "name": "qwen2.5_gsm8k",
-            "metrics": [
-                {
-                    "name": "AverageAccuracy",
-                    "categories": [
-                        {
-                            "name": "default",
-                            "subsets": [
-                                {
-                                    "name": "main",
-                                    "score": 0.0,
-                                    "num": 2
-                                }
-                            ],
-                            "num": 2,
-                            "score": 0.0,
-                            "macro_score": 0.0
-                        }
-                    ],
-                    "num": 2,
-                    "score": 0.0,
-                    "macro_score": 0.0
-                }
-            ],
-            "dataset_name": "gsm8k",
-            "model_name": "qwen2.5"
-        }
-        """ # noqa: E501
-        kwargs['category_map'] = self.category_map
-        kwargs['metric_list'] = self.metric_list
-        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        """
-        Generate model prompt from raw input, unify the prompt format for different datasets.
-        The input format is compatible with OpenAI Chat Completions APIs.
-
-        Args:
-            input_d (Any): The raw input. Depending on the dataset.
-            subset_name (str): The subset name.
-            few_shot_list (list): The few-shot examples.
-
-        Returns:
-            For class ChatGenerationModelAdapter, the output format is:
-                {'data': [full_prompt], 'system_prompt': (str, optional)}, -- full_prompt: str, the constructed prompt for each sample from dataset.
-            For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
-            For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
-        """ # noqa: E501
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_gold_answer(self, input_d: Any) -> Any:
-        """
-        Parse the raw input labels (gold).
-
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def match(self, gold: Any, pred: Any) -> Any:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                        e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                        e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        raise NotImplementedError
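For orientation, the removed `DataAdapter` ABC above is what the new `evalscope/api/benchmark` adapter classes in 1.2.0 appear to replace (judging by the file list). Below is a minimal sketch of how a benchmark plugged into the 0.10.0-era interface by implementing the four methods named in its docstring. It is illustrative only, not code from either wheel: `MyQAAdapter` and the `question`/`answer` field names are assumptions.

```python
# Sketch of a 0.10.0-style benchmark adapter (hypothetical; not from the package).
from typing import Any

from evalscope.benchmarks.data_adapter import DataAdapter  # module path as it existed in 0.10.0


class MyQAAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # Build one chat-style prompt per sample, prepending any few-shot examples.
        shots = '\n\n'.join(item['question'] + '\n' + item['answer'] for item in few_shot_list)
        full_prompt = (shots + '\n\n' if shots else '') + input_d['question']
        return {'data': [full_prompt], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        # The reference answer is taken straight from the raw record.
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        # For a plain QA task the model output is used as-is.
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        # Exact-match scoring; real adapters would defer to the metrics in metric_list.
        return float(gold.strip() == pred.strip())
```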
evalscope/benchmarks/gsm8k/gsm8k.py
@@ -1,121 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-"""Grade School Math 8k dataset."""
-
-import datasets
-import json
-import textwrap
-
-_CITATION = """\
-@misc{cobbe2021training,
-      title={Training Verifiers to Solve Math Word Problems},
-      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
-      year={2021},
-      eprint={2110.14168},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
-}
-"""
-
-_DESCRIPTION = """\
-GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality
-linguistically diverse grade school math word problems. The
-dataset was created to support the task of question answering
-on basic mathematical problems that require multi-step reasoning.
-"""
-
-_HOMEPAGE = 'https://openai.com/blog/grade-school-math'
-_MODELSCOPE_PAGE = 'https://modelscope.cn/datasets/modelscope/gsm8k/summary'
-
-_LICENSE = 'MIT'
-
-# _BASE_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/"
-TRAIN_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/train.jsonl'
-TEST_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/test.jsonl'
-
-
-class Gsm8kConfig(datasets.BuilderConfig):
-    """BuilderConfig for GSM8K."""
-
-    def __init__(self, urls, **kwargs):
-        """BuilderConfig for GSM8K.
-        Args:
-        urls: *dict[string]*, the urls for each split of the GSM8k set.
-        """
-        super().__init__(version=datasets.Version('1.1.0'), **kwargs)
-        self.urls = urls
-
-
-class Gsm8k(datasets.GeneratorBasedBuilder):
-    """Grade School Math 8k (GSM8K)"""
-
-    BUILDER_CONFIGS = [
-        Gsm8kConfig(
-            name='main',
-            description=textwrap.dedent(
-                """
-                It is segmented into 7.5K training problems and 1K test problems.
-                These problems take between 2 and 8 steps to solve, and solutions
-                primarily involve performing a sequence of elementary calculations
-                using basic arithmetic operations (+ - / *) to reach the final
-                answer. A bright middle school student should be able to solve
-                every problem.
-                """, ),
-            urls={
-                'train': TRAIN_URL,
-                'test': TEST_URL,
-            },
-        ),
-    ]
-
-    def _info(self):
-        features = datasets.Features({
-            'question': datasets.Value('string'),
-            'answer': datasets.Value('string'),
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(self.config.urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': data_dir['train'],
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': data_dir['test'],
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        with open(filepath, encoding='utf-8') as f:
-            for key, row in enumerate(f):
-                data = json.loads(row)
-                yield key, {
-                    'question': data['question'],
-                    'answer': data['answer'],
-                }
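The removed `gsm8k.py` above is a Hugging Face `GeneratorBasedBuilder` script; in 0.10.0 it was pulled in through `MsDataset.load` in the `DataAdapter.load` method shown earlier. As a rough sketch, such a script can also be exercised directly, assuming a `datasets` release that still supports loading local builder scripts; the path below is illustrative.

```python
# Sketch only: load the legacy builder script directly with an older `datasets`
# release that still allows local script loading. 'path/to/gsm8k.py' is illustrative.
from datasets import load_dataset

gsm8k = load_dataset('path/to/gsm8k.py', 'main', split='test')  # 'main' is the script's only config
sample = gsm8k[0]
print(sample['question'])  # word problem text
print(sample['answer'])    # worked solution; GSM8K answers end with a final '#### <number>' line
```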
evalscope/benchmarks/hellaswag/hellaswag.py
@@ -1,112 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-"""HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI.
-A paper was published at ACL2019.
-"""
-"""DO NOT EDIT."""
-
-import datasets
-import json
-
-# flake8: noqa
-
-# HomePage: https://rowanzellers.com/hellaswag/
-# GitHub: https://github.com/rowanz/hellaswag
-
-_CITATION = """\
-@inproceedings{zellers2019hellaswag,
-    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
-    author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
-    booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
-    year={2019}
-}
-"""
-
-_DESCRIPTION = """
-HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.
-"""
-_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/hellaswag/'
-_URLS = {
-    'train': _URL + 'hellaswag_train.jsonl',
-    'test': _URL + 'hellaswag_test.jsonl',
-    'dev': _URL + 'hellaswag_val.jsonl',
-}
-
-
-class Hellaswag(datasets.GeneratorBasedBuilder):
-    """TODO(hellaswag): Short description of my dataset."""
-
-    # TODO(hellaswag): Set up version.
-    VERSION = datasets.Version('0.1.0')
-
-    def _info(self):
-        # TODO(hellaswag): Specifies the datasets.DatasetInfo object
-        return datasets.DatasetInfo(
-            # This is the description that will appear on the datasets page.
-            description=_DESCRIPTION,
-            # datasets.features.FeatureConnectors
-            features=datasets.Features({
-                # These are the features of your dataset like images, labels ...
-                'ind': datasets.Value('int32'),
-                'activity_label': datasets.Value('string'),
-                'ctx_a': datasets.Value('string'),
-                'ctx_b': datasets.Value('string'),
-                'ctx': datasets.Value('string'),
-                'endings': datasets.features.Sequence(datasets.Value('string')),
-                'source_id': datasets.Value('string'),
-                'split': datasets.Value('string'),
-                'split_type': datasets.Value('string'),
-                'label': datasets.Value('string'),
-            }),
-            # If there's a common (input, target) tuple from the features,
-            # specify them here. They'll be used if as_supervised=True in
-            # builder.as_dataset.
-            supervised_keys=None,
-            # Homepage of the dataset for documentation
-            homepage='https://rowanzellers.com/hellaswag/',
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        # TODO(hellaswag): Downloads the data and defines the splits
-        # dl_manager is a datasets.download.DownloadManager that can be used to
-        # download and extract URLs
-        urls_to_download = _URLS
-        dl_dir = dl_manager.download_and_extract(urls_to_download)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['train']},
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['test']},
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['dev']},
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        # TODO(hellaswag): Yields (key, example) tuples from the dataset
-        with open(filepath, encoding='utf-8') as f:
-            for id_, row in enumerate(f):
-                data = json.loads(row)
-                yield id_, {
-                    'ind': int(data['ind']),
-                    'activity_label': data['activity_label'],
-                    'ctx_a': data.get('ctx_a', ''),
-                    'ctx_b': data.get('ctx_b', ''),
-                    'ctx': data['ctx'],
-                    'endings': data.get('endings', []),
-                    'source_id': data['source_id'],
-                    'split': data['split'],
-                    'split_type': data['split_type'],
-                    'label': str(data.get('label', '')),
-                }
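The records this HellaSwag script yields fit the continuation-style prompt format described in the removed `DataAdapter.gen_prompt` docstring (`{'data': ctx_continuation_pair_list, 'multi_choices': self.choices}`). A sketch with a made-up record, showing how `ctx` pairs with each candidate in `endings`; the sample values are invented for illustration.

```python
# Made-up HellaSwag-style record, shaped like the features declared in the script above.
record = {
    'ctx': 'A man is standing on a ladder outside a house. He',
    'endings': ['starts painting the wall.', 'dives into a pool.', 'plays a violin.', 'reads a newspaper.'],
    'label': '0',  # index of the correct ending, stored as a string
}

# One (context, continuation) pair per candidate ending, matching the removed
# ContinuationEvalModelAdapter output format; the choices are the ending indices.
ctx_continuation_pairs = [(record['ctx'], ' ' + ending) for ending in record['endings']]
prompt_d = {'data': ctx_continuation_pairs, 'multi_choices': ['0', '1', '2', '3']}
```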
evalscope/benchmarks/humaneval/humaneval.py
@@ -1,79 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import datasets
-import json
-
-# flake8: noqa
-
-# NOTE: AUTOGENERATED, DO NOT CHANGE.
-
-_DESCRIPTION = """\
-The HumanEval dataset released by OpenAI contains 164 handcrafted programming challenges together with unittests to very the viability of a proposed solution.
-"""
-
-# _URL = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz"
-_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/humaneval/HumanEval.jsonl.gz'
-
-_CITATION = """\
-@misc{chen2021evaluating,
-      title={Evaluating Large Language Models Trained on Code},
-      author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
-      year={2021},
-      eprint={2107.03374},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
-}"""
-
-_HOMEPAGE = 'https://github.com/openai/human-eval'
-
-_LICENSE = 'MIT'
-
-
-class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
-    """HumanEval: A benchmark for code generation."""
-
-    VERSION = datasets.Version('1.0.0')
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name='openai_humaneval',
-            version=datasets.Version('1.0.0'),
-            description=_DESCRIPTION,
-        )
-    ]
-
-    def _info(self):
-        features = datasets.Features({
-            'task_id': datasets.Value('string'),
-            'prompt': datasets.Value('string'),
-            'canonical_solution': datasets.Value('string'),
-            'test': datasets.Value('string'),
-            'entry_point': datasets.Value('string'),
-        })
-
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            supervised_keys=None,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        data_dir = dl_manager.download_and_extract(_URL)
-        return [datasets.SplitGenerator(
-            name=datasets.Split.TEST,
-            gen_kwargs={
-                'filepath': data_dir,
-            },
-        )]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        with open(filepath, encoding='utf-8') as file:
-            data = [json.loads(line) for line in file]
-            id_ = 0
-            for sample in data:
-                yield id_, sample
-                id_ += 1
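The removed `humaneval.py` only yields raw records (`task_id`, `prompt`, `canonical_solution`, `test`, `entry_point`); scoring happens elsewhere. Below is a deliberately simplified, unsandboxed sketch of how those fields combine into a pass/fail check. Real harnesses run each candidate in an isolated subprocess with timeouts; this is not the package's implementation.

```python
# Simplified, unsandboxed sketch of stitching a HumanEval record and a model
# completion into an executable check. Never exec untrusted model output like this.
def passes(sample: dict, completion: str) -> bool:
    program = (
        sample['prompt'] + completion + '\n'   # function signature + docstring + model body
        + sample['test'] + '\n'                # defines check(candidate) with the unit tests
        + f"check({sample['entry_point']})\n"  # run the tests against the completed function
    )
    try:
        exec(program, {'__name__': '__main__'})
        return True
    except Exception:
        return False
```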