evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
--- evalscope/benchmarks/mmlu/mmlu.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# isort: skip_file
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-
-import datasets
-import os
-import pandas as pd
-"""The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""
-
-_CITATION = """\
-@article{hendryckstest2021,
-    title={Measuring Massive Multitask Language Understanding},
-    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
-    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
-    year={2021}
-}
-"""
-
-_DESCRIPTION = """\
-Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas
-Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/mmlu/summary'
-
-_LICENSE = 'MIT'
-
-# _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/mmlu/repo?Revision=master&FilePath=data.tar'
-
-task_list = [
-    'high_school_european_history',
-    'business_ethics',
-    'clinical_knowledge',
-    'medical_genetics',
-    'high_school_us_history',
-    'high_school_physics',
-    'high_school_world_history',
-    'virology',
-    'high_school_microeconomics',
-    'econometrics',
-    'college_computer_science',
-    'high_school_biology',
-    'abstract_algebra',
-    'professional_accounting',
-    'philosophy',
-    'professional_medicine',
-    'nutrition',
-    'global_facts',
-    'machine_learning',
-    'security_studies',
-    'public_relations',
-    'professional_psychology',
-    'prehistory',
-    'anatomy',
-    'human_sexuality',
-    'college_medicine',
-    'high_school_government_and_politics',
-    'college_chemistry',
-    'logical_fallacies',
-    'high_school_geography',
-    'elementary_mathematics',
-    'human_aging',
-    'college_mathematics',
-    'high_school_psychology',
-    'formal_logic',
-    'high_school_statistics',
-    'international_law',
-    'high_school_mathematics',
-    'high_school_computer_science',
-    'conceptual_physics',
-    'miscellaneous',
-    'high_school_chemistry',
-    'marketing',
-    'professional_law',
-    'management',
-    'college_physics',
-    'jurisprudence',
-    'world_religions',
-    'sociology',
-    'us_foreign_policy',
-    'high_school_macroeconomics',
-    'computer_security',
-    'moral_scenarios',
-    'moral_disputes',
-    'electrical_engineering',
-    'astronomy',
-    'college_biology',
-]
-
-
-class MMLUConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class MMLU(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [MMLUConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'input': datasets.Value('string'),
-            'A': datasets.Value('string'),
-            'B': datasets.Value('string'),
-            'C': datasets.Value('string'),
-            'D': datasets.Value('string'),
-            'target': datasets.Value('string'),
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        task_name = self.config.name
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'test', f'{task_name}_test.csv'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'val', f'{task_name}_val.csv'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'dev', f'{task_name}_dev.csv'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        df = pd.read_csv(filepath)
-        df.columns = ['input', 'A', 'B', 'C', 'D', 'target']
-
-        for i, instance in enumerate(df.to_dict(orient='records')):
-            yield i, instance
--- evalscope/benchmarks/mmlu/samples.jsonl
+++ /dev/null
@@ -1,5 +0,0 @@
-{'input': 'A "dished face" profile is often associated with', 'A': 'a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'B': 'a recessive maxilla due to failure of elongation of the cranial base.', 'C': 'an enlarged frontal bone due to hydrocephaly.', 'D': 'defective development of the maxillary air sinus.', 'target': 'B'}
-{'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.', 'A': 'Service quality.', 'B': 'Service action.', 'C': 'Service recovery.', 'D': 'Service satisfaction.', 'target': 'A'}
-{'input': ' Information collected for the first time specifically for a marketing research study is called:', 'A': 'Secondary research.', 'B': 'Primary research.', 'C': 'Soft research.', 'D': 'Experimental research.', 'target': 'B'}
-{'input': "This includes advertisements that contain 'call-to-response' mechanisms such as telephone numbers, website addresses, email and postal addresses:", 'A': 'Direct response advertising.', 'B': 'Sales promotions.', 'C': 'Mass media advertising.', 'D': 'Public relations.', 'target': 'A'}
-{'input': 'Which of the following is not part of the external marketing environment?', 'A': 'Political.', 'B': 'Legal.', 'C': 'Product.', 'D': 'Socio-cultural.', 'target': 'C'}
--- evalscope/benchmarks/race/race.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import datasets
-import os
-import pandas as pd
-
-_CITATION = """\
-@inproceedings{lai-etal-2017-race,
-    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
-    author = "Lai, Guokun and
-      Xie, Qizhe and
-      Liu, Hanxiao and
-      Yang, Yiming and
-      Hovy, Eduard",
-    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
-    month = sep,
-    year = "2017",
-    address = "Copenhagen, Denmark",
-    publisher = "Association for Computational Linguistics",
-    url = "https://aclanthology.org/D17-1082",
-    doi = "10.18653/v1/D17-1082",
-    pages = "785--794",
-}
-"""
-
-_DESCRIPTION = """\
-RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'
-
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'
-
-task_list = [
-    'high',
-    'middle',
-]
-
-
-class RACEConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class RACE(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'example_id': datasets.Value('string'),
-            'article': datasets.Value('string'),
-            'answer': datasets.Value('string'),
-            'question': datasets.Value('string'),
-            'options': [datasets.Value('string')],
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        task_name = self.config.name
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        df = pd.read_parquet(filepath)
-        df.columns = ['example_id', 'article', 'answer', 'question', 'options']
-
-        for i, instance in enumerate(df.to_dict(orient='records')):
-            yield i, instance
--- evalscope/benchmarks/race/samples.jsonl
+++ /dev/null
@@ -1,5 +0,0 @@
-{'example_id': 'middle4227.txt', 'article': 'There are many kinds...ealthy.\n,.', 'answer': 'D', 'question': 'We may read this pas... in _ .', 'options': ['a letter', 'a story', 'a newspaper', 'a health magazine']}
-{'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
-{'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
-{'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
-{'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
--- evalscope/benchmarks/trivia_qa/trivia_qa.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import datasets
-import json
-import os
-import pandas as pd
-
-_CITATION = """\
-@article{2017arXivtriviaqa,
-    author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
-              Daniel and {Zettlemoyer}, Luke},
-    title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
-    journal = {arXiv e-prints},
-    year = 2017,
-    eid = {arXiv:1705.03551},
-    pages = {arXiv:1705.03551},
-    archivePrefix = {arXiv},
-    eprint = {1705.03551},
-}
-"""
-
-_DESCRIPTION = """\
-TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'
-
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'
-
-task_list = ['default']
-
-
-class TriviaQAConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class TriviaQA(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [TriviaQAConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'input': [{
-                'role': datasets.features.Value('string'),
-                'content': datasets.features.Value('string'),
-            }],
-            'ideal': [datasets.Value('string')],
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split('dev'),
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        with open(filepath, encoding='utf-8') as f:
-            contents = [json.loads(line) for line in f.readlines()]
-        for i, instance in enumerate(contents):
-            yield i, instance
--- evalscope/benchmarks/truthful_qa/truthful_qa.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TruthfulQA dataset."""
-# flake8: noqa
-
-import csv
-import datasets
-import json
-
-_CITATION = """\
-@misc{lin2021truthfulqa,
-    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-    author={Stephanie Lin and Jacob Hilton and Owain Evans},
-    year={2021},
-    eprint={2109.07958},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions. The benchmark comprises 817 questions that
-span 38 categories, including health, law, finance and politics. Questions are
-crafted so that some humans would answer falsely due to a false belief or
-misconception. To perform well, models must avoid generating false answers
-learned from imitating human texts.
-"""
-
-_HOMEPAGE = 'https://github.com/sylinrl/TruthfulQA'
-
-_LICENSE = 'Apache License 2.0'
-
-
-class TruthfulQaConfig(datasets.BuilderConfig):
-    """BuilderConfig for TruthfulQA."""
-
-    def __init__(self, url, features, **kwargs):
-        """BuilderConfig for TruthfulQA.
-        Args:
-          url: *string*, the url to the configuration's data.
-          features: *list[string]*, list of features that'll appear in the feature dict.
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super().__init__(version=datasets.Version('1.1.0'), **kwargs)
-        self.url = url
-        self.features = features
-
-
-class TruthfulQa(datasets.GeneratorBasedBuilder):
-    """TruthfulQA is a benchmark to measure whether a language model is truthful in generating answers to questions."""
-
-    BUILDER_CONFIGS = [
-        TruthfulQaConfig(
-            name='generation',
-            # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
-            url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/TruthfulQA.csv',
-            features=datasets.Features({
-                'type': datasets.Value('string'),
-                'category': datasets.Value('string'),
-                'question': datasets.Value('string'),
-                'best_answer': datasets.Value('string'),
-                'correct_answers': datasets.features.Sequence(datasets.Value('string')),
-                'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
-                'source': datasets.Value('string'),
-            }),
-            description=
-            "The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
-        ),
-        TruthfulQaConfig(
-            name='multiple_choice',
-            # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
-            url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/mc_task.json',
-            features=datasets.Features({
-                'question': datasets.Value('string'),
-                'mc1_targets': {
-                    'choices': datasets.features.Sequence(datasets.Value('string')),
-                    'labels': datasets.features.Sequence(datasets.Value('int32')),
-                },
-                'mc2_targets': {
-                    'choices': datasets.features.Sequence(datasets.Value('string')),
-                    'labels': datasets.features.Sequence(datasets.Value('int32')),
-                },
-            }),
-            description=
-            "The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
-        ),
-    ]
-
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=self.config.features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download(self.config.url)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': data_dir,
-                },
-            ),
-        ]
-
-    def _split_csv_list(self, csv_list: str, delimiter: str = ';') -> str:
-        """
-        Splits a csv list field, delimited by `delimiter` (';'), into a list
-        of strings.
-        """
-        csv_list = csv_list.strip().split(delimiter)
-        return [item.strip() for item in csv_list]
-
-    def _generate_examples(self, filepath):
-        if self.config.name == 'multiple_choice':
-            # Multiple choice data is in a `JSON` file.
-            with open(filepath, encoding='utf-8') as f:
-                contents = json.load(f)
-            for key, row in enumerate(contents):
-                yield key, {
-                    'question': row['question'],
-                    'mc1_targets': {
-                        'choices': list(row['mc1_targets'].keys()),
-                        'labels': list(row['mc1_targets'].values()),
-                    },
-                    'mc2_targets': {
-                        'choices': list(row['mc2_targets'].keys()),
-                        'labels': list(row['mc2_targets'].values()),
-                    },
-                }
-        else:
-            # Generation data is in a `CSV` file.
-            with open(filepath, newline='', encoding='utf-8-sig') as f:
-                contents = csv.DictReader(f)
-                for key, row in enumerate(contents):
-                    # Ensure that references exist.
-                    if not row['Correct Answers'] or not row['Incorrect Answers']:
-                        continue
-                    yield key, {
-                        'type': row['Type'],
-                        'category': row['Category'],
-                        'question': row['Question'],
-                        'best_answer': row['Best Answer'],
-                        'correct_answers': self._split_csv_list(row['Correct Answers']),
-                        'incorrect_answers': self._split_csv_list(row['Incorrect Answers']),
-                        'source': row['Source'],
-                    }