evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py
@@ -0,0 +1,170 @@
+# flake8: noqa: E501
+
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+PROMPT_TEMPLATE = """
+#Instruction#:
+Classify whether the given text is a Drivelology sample or not.
+
+#Definition#:
+- Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+emotional insight to unravel their true significance.
+- non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+or proverbs, that convey clear or straightforward information without the layered complexity
+characteristic of Drivelology.
+
+#Output Format#:
+You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+The answer you give MUST be \"Yes\" or \"No\"".
+
+#Input Text#: {text}
+#Your Answer#:
+""".strip()  # noqa: E501
+
+FEWSHOT_PROMPT_TEMPLATE = """
+#Instruction#:
+Classify whether the given text is a Drivelology sample or not.
+
+#Definition#:
+- Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+emotional insight to unravel their true significance.
+- non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+or proverbs, that convey clear or straightforward information without the layered complexity
+characteristic of Drivelology.
+
+#Output Format#:
+You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+The answer you give MUST be \"Yes\" or \"No\"".
+
+Here are some examples of how to solve similar problems:
+
+#Input Text#: Saw a book called "how to solve 50 percent of your problems" so I bought 2 books.
+#Your Answer#: Yes
+
+#Input Text#: Colourless green ideas sleep furiously.
+#Your Answer#: No
+
+#Input Text#: I went to a restaurant, and saw this guy was choking. I gotta save him. And then I realized he was just speaking French.
+#Your Answer#: Yes
+
+#Input Text#: Either it is or it isn't.
+#Your Answer#: No
+
+#Input Text#: {text}
+#Your Answer#:
+""".strip()  # noqa: E501
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_binary',
+        pretty_name='DrivelologyBinaryClassification',
+        tags=[Tags.YES_NO],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['binary-classification'],
+        metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+        aggregation='f1',
+        few_shot_num=0,
+        eval_split='test',
+        prompt_template='{question}',
+        few_shot_prompt_template='{question}'
+    )
+)
+class DrivelologyBinaryClassificationAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+        if self.few_shot_num not in [0, 4]:
+            logger.warning(f'For DrivelologyBinaryClassification, use 4-shot by default.')
+            self.few_shot_num = 4
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        if self.few_shot_num > 0:
+            prompt = FEWSHOT_PROMPT_TEMPLATE.format(text=record['text'])
+        else:
+            prompt = PROMPT_TEMPLATE.format(text=record['text'])
+        content_list: List[Content] = [ContentText(text=prompt)]
+        answer = 'YES' if str(record['label']) == 'drivelology' else 'NO'  # 'YES' or 'NO'
+        return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+            'answer': answer,
+        })
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        # Check if the reference answer is in the filtered prediction
+        result = 1 if reference in filtered_prediction.strip().upper() else 0
+        score.value = {'acc': result}
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+        """
+
+        def compute_metrics(scores: List[SampleScore]):
+            tp = fp = tn = fn = 0
+            yes_count = 0
+            total_count = len(scores)
+
+            for ss in scores:
+                gt = ss.sample_metadata['answer'].strip().upper()
+                # Get prediction based on score
+                pred = gt if ss.score.main_value == 1 else ('NO' if gt == 'YES' else 'YES')
+                if pred == 'YES':
+                    yes_count += 1
+                if pred == 'YES' and gt == 'YES':
+                    tp += 1
+                elif pred == 'YES' and gt == 'NO':
+                    fp += 1
+                elif pred == 'NO' and gt == 'NO':
+                    tn += 1
+                elif pred == 'NO' and gt == 'YES':
+                    fn += 1
+
+            accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+            yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+            return {
+                'accuracy': accuracy,
+                'precision': precision,
+                'recall': recall,
+                'f1_score': f1_score,
+                'yes_ratio': yes_ratio
+            }
+
+        overall_metrics = compute_metrics(sample_scores)
+        agg_scores = []
+        for metric_name, value in overall_metrics.items():
+            agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+        return agg_scores
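
The `@register_benchmark` decorator above makes `drivel_binary` selectable by name like any other evalscope benchmark. Below is a minimal usage sketch, assuming the `TaskConfig`/`run_task` entry points exposed by the `evalscope/config.py` and `evalscope/run.py` modules listed above; the model identifier and limit are placeholders, not part of this diff.

# Hypothetical sketch (names assumed): run the newly registered benchmark.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',   # placeholder model identifier
    datasets=['drivel_binary'],    # name registered by the adapter above
    limit=50,                      # score only a small subset as a smoke test
)
run_task(task_cfg)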
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
|
|
5
|
+
from evalscope.api.dataset import Sample
|
|
6
|
+
from evalscope.api.evaluator import TaskState
|
|
7
|
+
from evalscope.api.messages import ChatMessageUser, ContentText
|
|
8
|
+
from evalscope.api.metric.scorer import AggScore, SampleScore, Score
|
|
9
|
+
from evalscope.api.registry import register_benchmark
|
|
10
|
+
from evalscope.constants import Tags
|
|
11
|
+
from evalscope.utils.logger import get_logger
|
|
12
|
+
from evalscope.utils.multi_choices import parse_answers, prompt
|
|
13
|
+
|
|
14
|
+
logger = get_logger()
|
|
15
|
+
|
|
16
|
+
DESCRIPTION = (
|
|
17
|
+
'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
|
|
18
|
+
'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
|
|
19
|
+
'or rhetorically subversive.'
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
MULTIPLE_ANSWER_TEMPLATE = r"""
|
|
23
|
+
#Instruction#:
|
|
24
|
+
Classify the given text into one or more of the following categories: inversion, wordplay, switchbait, paradox, and misdirection.
|
|
25
|
+
|
|
26
|
+
#Definitions#:
|
|
27
|
+
- inversion: This technique takes a well-known phrase, cliché, or social script and flips it on its head. The humour arises by reversing a familiar structure to creating a new, often satirical, meaning.
|
|
28
|
+
- wordplay: This is the use of linguistic creativity, often by exploiting the phonetics or polysemy of words. It includes puns, double entendres, and similarities.
|
|
29
|
+
- switchbait: This technique hinges on a specific phrase (the "bait") that has a culturally-embedded double meaning. The initial context is then suddenly replaced (the "switch") by a surprising second meaning. The humour is generated by this cynical or culturally-specific reinterpretation of the bait, rather than by derailing a narrative.
|
|
30
|
+
- paradox: This relies on a statement that appears logically self-contradictory but contains a latent, often humorous or profound truth. The core of the technique is the clash of seemingly incompatible ideas.
|
|
31
|
+
- misdirection: This technique leads the listener down an expected path before a final twist reveals a different, often more literal or absurd, ending.
|
|
32
|
+
|
|
33
|
+
Answer the following multiple choice question where multiple answers may be correct.
|
|
34
|
+
The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
|
|
35
|
+
|
|
36
|
+
{question}
|
|
37
|
+
|
|
38
|
+
{choices}
|
|
39
|
+
""".strip() # noqa: E501
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@register_benchmark(
|
|
43
|
+
BenchmarkMeta(
|
|
44
|
+
name='drivel_multilabel',
|
|
45
|
+
pretty_name='DrivelologyMultilabelClassification',
|
|
46
|
+
tags=[Tags.MULTIPLE_CHOICE],
|
|
47
|
+
description=DESCRIPTION.strip(),
|
|
48
|
+
dataset_id='extraordinarylab/drivel-hub',
|
|
49
|
+
subset_list=['multi-label-classification'],
|
|
50
|
+
metric_list=['f1_weighted', 'f1_micro', 'f1_macro', 'exact_match'],
|
|
51
|
+
aggregation='f1_weighted',
|
|
52
|
+
eval_split='test',
|
|
53
|
+
prompt_template='{question}',
|
|
54
|
+
)
|
|
55
|
+
)
|
|
56
|
+
class DrivelologyMultilabelClassificationAdapter(DefaultDataAdapter):
|
|
57
|
+
|
|
58
|
+
def __init__(self, *args, **kwargs):
|
|
59
|
+
super().__init__(*args, **kwargs)
|
|
60
|
+
self.categories = ['inversion', 'wordplay', 'switchbait', 'paradox', 'misdirection']
|
|
61
|
+
self.choices = {'A': 'inversion', 'B': 'wordplay', 'C': 'switchbait', 'D': 'paradox', 'E': 'misdirection'}
|
|
62
|
+
self.categories_to_letters = {v: k for k, v in self.choices.items()}
|
|
63
|
+
|
|
64
|
+
def record_to_sample(self, record: Dict[str, Any]) -> Sample:
|
|
65
|
+
text: str = record['text']
|
|
66
|
+
label: List[str] = record['label']
|
|
67
|
+
question = f'Text to classify: {text}'
|
|
68
|
+
choices_list = [f'{key}. {value}' for key, value in self.choices.items()]
|
|
69
|
+
input_text = prompt(question=question, choices=choices_list, template=MULTIPLE_ANSWER_TEMPLATE)
|
|
70
|
+
content_list = [ContentText(text=input_text)]
|
|
71
|
+
target_letters = ''.join(
|
|
72
|
+
sorted([self.categories_to_letters[cat] for cat in label if cat in self.categories_to_letters])
|
|
73
|
+
)
|
|
74
|
+
metadata = {'text': text, 'label': label, 'target_letters': target_letters}
|
|
75
|
+
return Sample(
|
|
76
|
+
input=[ChatMessageUser(content=content_list)],
|
|
77
|
+
choices=choices_list,
|
|
78
|
+
target=target_letters,
|
|
79
|
+
metadata=metadata,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
def extract_answer(self, prediction: str, task_state: TaskState) -> str:
|
|
83
|
+
pattern = r'ANSWER:\s*([A-E]+)'
|
|
84
|
+
match = re.search(pattern, prediction)
|
|
85
|
+
if match:
|
|
86
|
+
letters = match.group(1).strip().upper()
|
|
87
|
+
return ''.join(sorted(set(letters)))
|
|
88
|
+
else:
|
|
89
|
+
try:
|
|
90
|
+
answers = parse_answers(prediction)
|
|
91
|
+
return ''.join(sorted(list(answers)))
|
|
92
|
+
except Exception as e:
|
|
93
|
+
logger.warning(f'Could not extract answer from: {prediction}. Error: {e}')
|
|
94
|
+
return ''
|
|
95
|
+
|
|
96
|
+
def match_score(
|
|
97
|
+
self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
|
|
98
|
+
) -> Score:
|
|
99
|
+
"""
|
|
100
|
+
Calculate the match score between the prediction and reference for multilabel classification.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
original_prediction: The original model output
|
|
104
|
+
filtered_prediction: The extracted answer (letter format, e.g., "AC")
|
|
105
|
+
reference: The reference answer (letter format, e.g., "AC")
|
|
106
|
+
task_state: The current task state
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
Score object with metrics
|
|
110
|
+
"""
|
|
111
|
+
# Create a Score object as required by the API
|
|
112
|
+
score = Score(
|
|
113
|
+
extracted_prediction=filtered_prediction,
|
|
114
|
+
prediction=original_prediction,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Convert letter answers to category sets
|
|
118
|
+
pred_categories = set(self.choices.get(letter, '') for letter in filtered_prediction)
|
|
119
|
+
target_categories = set(self.choices.get(letter, '') for letter in reference)
|
|
120
|
+
|
|
121
|
+
# Remove empty strings (may be caused by invalid letters)
|
|
122
|
+
pred_categories = {cat for cat in pred_categories if cat}
|
|
123
|
+
target_categories = {cat for cat in target_categories if cat}
|
|
124
|
+
|
|
125
|
+
# Calculate TP (true positives), FP (false positives), and FN (false negatives)
|
|
126
|
+
tp = len(pred_categories & target_categories) # intersection
|
|
127
|
+
fp = len(pred_categories - target_categories) # in prediction but not in target
|
|
128
|
+
fn = len(target_categories - pred_categories) # in target but not in prediction
|
|
129
|
+
|
|
130
|
+
# Calculate precision, recall and F1 score
|
|
131
|
+
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
|
|
132
|
+
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
|
|
133
|
+
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
|
|
134
|
+
|
|
135
|
+
# Calculate exact match (1.0 if prediction exactly matches target)
|
|
136
|
+
exact_match = 1.0 if pred_categories == target_categories else 0.0
|
|
137
|
+
|
|
138
|
+
# Store category information in metadata for later aggregation
|
|
139
|
+
category_data = {}
|
|
140
|
+
for cat in self.categories:
|
|
141
|
+
in_pred = cat in pred_categories
|
|
142
|
+
in_target = cat in target_categories
|
|
143
|
+
|
|
144
|
+
category_data[cat] = {
|
|
145
|
+
'tp': 1 if in_pred and in_target else 0,
|
|
146
|
+
'fp': 1 if in_pred and not in_target else 0,
|
|
147
|
+
'fn': 1 if not in_pred and in_target else 0,
|
|
148
|
+
'support': 1 if in_target else 0
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
# Set simple numerical values in score.value as expected by the API
|
|
152
|
+
score.value = {'f1': f1, 'precision': precision, 'recall': recall, 'exact_match': exact_match}
|
|
153
|
+
|
|
154
|
+
# Store category data in metadata for aggregation
|
|
155
|
+
score.metadata = {'category_data': category_data}
|
|
156
|
+
|
|
157
|
+
return score
|
|
158
|
+
|
|
159
|
+
def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
|
|
160
|
+
"""
|
|
161
|
+
Aggregate scores across all samples.
|
|
162
|
+
Computes weighted, macro, and micro F1 scores for multilabel classification.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
sample_scores: List of sample scores
|
|
166
|
+
|
|
167
|
+
Returns:
|
|
168
|
+
List of aggregated scores
|
|
169
|
+
"""
|
|
170
|
+
if not sample_scores:
|
|
171
|
+
return [
|
|
172
|
+
AggScore(metric_name='f1_weighted', score=0.0, num=0, metadata={}),
|
|
173
|
+
AggScore(metric_name='f1_micro', score=0.0, num=0, metadata={}),
|
|
174
|
+
+                AggScore(metric_name='f1_macro', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='exact_match', score=0.0, num=0, metadata={})
+            ]
+
+        # Initialize category statistics
+        category_stats = {cat: {'tp': 0, 'fp': 0, 'fn': 0, 'support': 0} for cat in self.categories}
+        total_exact_matches = 0
+        num_samples = len(sample_scores)
+
+        # Aggregate statistics across all samples
+        for ss in sample_scores:
+            # Add exact match score to total
+            total_exact_matches += ss.score.value.get('exact_match', 0)
+
+            # Get category data from metadata
+            if 'category_data' in ss.score.metadata:
+                cat_data = ss.score.metadata['category_data']
+                for cat, stats in cat_data.items():
+                    if cat in self.categories:
+                        category_stats[cat]['tp'] += stats.get('tp', 0)
+                        category_stats[cat]['fp'] += stats.get('fp', 0)
+                        category_stats[cat]['fn'] += stats.get('fn', 0)
+                        category_stats[cat]['support'] += stats.get('support', 0)
+
+        # Calculate F1 scores for each category
+        category_f1 = {}
+        total_support = sum(stats['support'] for stats in category_stats.values())
+        f1_sum = 0.0
+
+        for cat, stats in category_stats.items():
+            tp = stats['tp']
+            fp = stats['fp']
+            fn = stats['fn']
+
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+            category_f1[cat] = f1
+            f1_sum += f1
+
+        # Calculate micro-average F1 (based on aggregate TP, FP, FN)
+        total_tp = sum(stats['tp'] for stats in category_stats.values())
+        total_fp = sum(stats['fp'] for stats in category_stats.values())
+        total_fn = sum(stats['fn'] for stats in category_stats.values())
+
+        micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
+        micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
+        f1_micro = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if (
+            micro_precision + micro_recall
+        ) > 0 else 0.0
+
+        # Calculate macro-average F1 (simple average of category F1 scores)
+        f1_macro = f1_sum / len(self.categories) if self.categories else 0.0
+
+        # Calculate weighted-average F1 (weighted by support)
+        f1_weighted = 0.0
+        if total_support > 0:
+            for cat, stats in category_stats.items():
+                cat_f1 = category_f1[cat]
+                weight = stats['support'] / total_support
+                f1_weighted += cat_f1 * weight
+
+        # Calculate accuracy (proportion of exact matches)
+        exact_match = total_exact_matches / num_samples
+
+        # Return list of aggregate scores
+        return [
+            AggScore(
+                metric_name='f1_weighted',
+                score=f1_weighted,
+                num=num_samples,
+                metadata={'category_f1': {
+                    cat: f1
+                    for cat, f1 in category_f1.items()
+                }}
+            ),
+            AggScore(metric_name='f1_micro', score=f1_micro, num=num_samples, metadata={}),
+            AggScore(metric_name='f1_macro', score=f1_macro, num=num_samples, metadata={}),
+            AggScore(metric_name='exact_match', score=exact_match, num=num_samples, metadata={})
+        ]
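Editor's note: the aggregation logic in the hunk above can be exercised outside the adapter. The snippet below is an illustrative, self-contained sketch (plain Python, hypothetical category counts, no evalscope imports) of the same micro/macro/weighted F1 arithmetic; it is not part of the diff.

# Illustrative only: hypothetical per-category counts in the same
# {'tp', 'fp', 'fn', 'support'} shape the adapter accumulates.
category_stats = {
    'PER': {'tp': 8, 'fp': 2, 'fn': 1, 'support': 9},
    'LOC': {'tp': 3, 'fp': 1, 'fn': 3, 'support': 6},
}

def f1(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

category_f1 = {cat: f1(s['tp'], s['fp'], s['fn']) for cat, s in category_stats.items()}

# Macro: unweighted mean of the per-category F1 scores.
f1_macro = sum(category_f1.values()) / len(category_f1)

# Micro: F1 computed over the pooled TP/FP/FN counts.
total_tp = sum(s['tp'] for s in category_stats.values())
total_fp = sum(s['fp'] for s in category_stats.values())
total_fn = sum(s['fn'] for s in category_stats.values())
f1_micro = f1(total_tp, total_fp, total_fn)

# Weighted: per-category F1 weighted by support.
total_support = sum(s['support'] for s in category_stats.values())
f1_weighted = sum(category_f1[c] * s['support'] / total_support for c, s in category_stats.items())

print(f1_micro, f1_macro, f1_weighted)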
@@ -0,0 +1,49 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+PROMPT_TEMPLATE = r"""
+Tell me the best option in the following options which represents the underlying narrative of the text?
+The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+
+{choices}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_selection',
+        pretty_name='DrivelologyNarrativeSelection',
+        tags=[Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['multiple-choice-english-easy', 'multiple-choice-english-hard'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class DrivelologyNarrativeSelectionAdapter(MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['text'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
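Editor's note: a minimal sketch of how this newly registered benchmark could be invoked through evalscope's top-level API. The entry points follow evalscope's documented TaskConfig/run_task usage, but the model id and sample limit are placeholders and should be checked against the 1.2.0 documentation; this is not part of the diff.

# Sketch only: assumes the documented `TaskConfig` / `run_task` entry points.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['drivel_selection'],     # name registered by @register_benchmark above
    limit=10,                          # small smoke-test subset
)
run_task(task_cfg=task_cfg)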
@@ -0,0 +1,218 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+# Keep the original generation and evaluation templates
+NARRATIVE_GENERATION_TEMPLATE = """
+You need to first read and understand the text given. Generate a detailed description to illustrate the implicit narrative of the text.
+
+Please provide your response in English, with a clear and comprehensive explanation of the narrative.
+
+Text: {text}
+""".strip()  # noqa: E501
+
+NARRATIVE_EVALUATION_TEMPLATE = """
+Please act as an impartial judge and evaluate how accurately the candidate narrative matches the given reference narrative.
+Your evaluation should consider factors such as the relevance, accuracy, depth, and level of detail of the candidate narrative compared to the reference.
+
+Begin your evaluation by providing a short explanation in English. Be as objective as possible.
+
+After providing your explanation, you must rate the match on a Likert scale from 1 to 5, where:
+1 = Very poor match
+2 = Poor match
+3 = Moderate match
+4 = Good match
+5 = Excellent match
+
+Please format your rating strictly as: "Rating: [[X]]" where X is a whole number from 1 to 5.
+
+[Candidate Narrative]
+{candidate}
+
+[Reference Narrative]
+{reference}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_writing',
+        pretty_name='DrivelologyNarrativeWriting',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['narrative-writing-english'],
+        metric_list={
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/roberta-large',
+                'model_type': 'roberta-large'
+            },
+            'gpt_score': {}
+        },
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=NARRATIVE_GENERATION_TEMPLATE
+    )
+)
+class DrivelologyNarrativeWritingAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.use_batch_scoring = True  # Enable batch scoring
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        text = record['text']
+        reference_narrative = record['narrative']
+
+        # Format the generation prompt with the text
+        input_prompt = NARRATIVE_GENERATION_TEMPLATE.format(text=text)
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=reference_narrative,
+            metadata={
+                'text': text,
+                'reference_narrative': reference_narrative
+            }
+        )
+
+    def batch_match_score(self, original_predictions, filtered_predictions, references, task_states):
+        """
+        Batch calculate the match scores using BERTScore.
+        """
+        from evalscope.metrics.metric import BertScore
+
+        score_args = self.metric_list.get('bert_score', {})
+        bert_scorer = BertScore(**score_args)
+        bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+        scores = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={'bert_score': bert_score_f1[i]}
+            )
+            scores.append(score)
+        return scores
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Calculate the match score using LLM judge and BERTScore.
+        """
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Initialize score value dictionary
+        score.value = {}
+
+        # Use LLM judge to evaluate narrative quality
+        eval_prompt = NARRATIVE_EVALUATION_TEMPLATE.format(candidate=filtered_prediction, reference=reference)
+
+        judge_response = self.llm_judge.judge(eval_prompt)
+        logger.info(f'LLM judge response received (first 100 chars): {judge_response[:100]}...')
+
+        # Extract rating using regex pattern
+        match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
+        if match:
+            rating = int(match.group(1))
+            gpt_score = (rating - 1) / 4.0  # Normalize to 0-1 scale
+            logger.info(f'Rating extracted: {rating}/5 -> {gpt_score}')
+        else:
+            # Try alternative pattern
+            alt_match = re.search(r'(\[\[|\[)([1-5])(\]\]|\])', judge_response)
+            if alt_match:
+                rating = int(alt_match.group(2))
+                gpt_score = (rating - 1) / 4.0
+                logger.info(f'Rating extracted (alt pattern): {rating}/5 -> {gpt_score}')
+            else:
+                # Last resort: standalone digit
+                number_match = re.search(r'(?<!\d)[1-5](?!\d)', judge_response)
+                if number_match:
+                    rating = int(number_match.group(0))
+                    gpt_score = (rating - 1) / 4.0
+                    logger.info(f'Rating extracted (fallback): {rating}/5 -> {gpt_score}')
+                else:
+                    gpt_score = 0.0
+                    logger.warning('No rating found in response, using default 0.0')
+
+        score.value['gpt_score'] = gpt_score
+        score.explanation = f'LLM judge rating: {gpt_score:.2f}'
+
+        score.metadata = {
+            'judge_response': judge_response[:300],
+            'model': getattr(self.llm_judge, 'model_id', 'unknown')
+        }
+
+        score.main_score_name = 'gpt_score'
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate scores across all samples.
+        """
+        if not sample_scores:
+            return [
+                AggScore(metric_name='gpt_score', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='bert_score', score=0.0, num=0, metadata={})
+            ]
+
+        # Extract scores
+        gpt_scores = [ss.score.value.get('gpt_score', 0.0) for ss in sample_scores]
+        bert_scores = [ss.score.value.get('bert_score', 0.0) for ss in sample_scores]
+
+        # Calculate averages
+        avg_gpt_score = sum(gpt_scores) / len(gpt_scores) if gpt_scores else 0.0
+        avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0.0
+
+        return [
+            AggScore(
+                metric_name='gpt_score',
+                score=avg_gpt_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(gpt_scores),
+                    'max_score': max(gpt_scores)
+                }
+            ),
+            AggScore(
+                metric_name='bert_score',
+                score=avg_bert_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(bert_scores),
+                    'max_score': max(bert_scores)
+                }
+            )
+        ]
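Editor's note: the three-step rating extraction in llm_match_score is easy to lose track of in diff form. The snippet below is an illustrative restatement of the same fallback chain (strict "Rating: [[X]]", then any bracketed digit, then any standalone 1-5), with hypothetical judge responses; it is not part of the diff.

import re

def extract_gpt_score(judge_response: str) -> float:
    """Restates the adapter's fallback chain; returns a score normalized to [0, 1]."""
    # 1) Strict format requested by the evaluation template: "Rating: [[X]]"
    match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
    if match:
        rating = int(match.group(1))
    else:
        # 2) Any bracketed digit, e.g. "[4]" or "[[4]]"
        alt = re.search(r'(\[\[|\[)([1-5])(\]\]|\])', judge_response)
        if alt:
            rating = int(alt.group(2))
        else:
            # 3) Last resort: a standalone digit 1-5 anywhere in the text
            loose = re.search(r'(?<!\d)[1-5](?!\d)', judge_response)
            if not loose:
                return 0.0
            rating = int(loose.group(0))
    return (rating - 1) / 4.0

print(extract_gpt_score('Good coverage of the twist. Rating: [[4]]'))  # 0.75
print(extract_gpt_score('I would give this a [2].'))                   # 0.25
print(extract_gpt_score('No rating given.'))                           # 0.0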