evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
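The hunks below expand a few representative changes: the three new Drivelology adapters and the reworked DROP adapter. For orientation, here is a minimal sketch of how one of the newly registered benchmarks could be smoke-tested, assuming the `run_task` / `TaskConfig` entry points in `evalscope/run.py` and `evalscope/config.py` keep their 1.x signatures; the model id and sample limit are illustrative placeholders, not taken from this diff.

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Hypothetical smoke test: the model id and limit are placeholders, not part of this release.
task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',   # placeholder model
    datasets=['drivel_multilabel'],       # benchmark name registered by the new adapter below
    limit=10,                             # evaluate a small subset only
)
run_task(task)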
evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py (new file)
@@ -0,0 +1,254 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import parse_answers, prompt
+
+logger = get_logger()
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+MULTIPLE_ANSWER_TEMPLATE = r"""
+#Instruction#:
+Classify the given text into one or more of the following categories: inversion, wordplay, switchbait, paradox, and misdirection.
+
+#Definitions#:
+- inversion: This technique takes a well-known phrase, cliché, or social script and flips it on its head. The humour arises by reversing a familiar structure to creating a new, often satirical, meaning.
+- wordplay: This is the use of linguistic creativity, often by exploiting the phonetics or polysemy of words. It includes puns, double entendres, and similarities.
+- switchbait: This technique hinges on a specific phrase (the "bait") that has a culturally-embedded double meaning. The initial context is then suddenly replaced (the "switch") by a surprising second meaning. The humour is generated by this cynical or culturally-specific reinterpretation of the bait, rather than by derailing a narrative.
+- paradox: This relies on a statement that appears logically self-contradictory but contains a latent, often humorous or profound truth. The core of the technique is the clash of seemingly incompatible ideas.
+- misdirection: This technique leads the listener down an expected path before a final twist reveals a different, often more literal or absurd, ending.
+
+Answer the following multiple choice question where multiple answers may be correct.
+The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
+
+{question}
+
+{choices}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_multilabel',
+        pretty_name='DrivelologyMultilabelClassification',
+        tags=[Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['multi-label-classification'],
+        metric_list=['f1_weighted', 'f1_micro', 'f1_macro', 'exact_match'],
+        aggregation='f1_weighted',
+        eval_split='test',
+        prompt_template='{question}',
+    )
+)
+class DrivelologyMultilabelClassificationAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.categories = ['inversion', 'wordplay', 'switchbait', 'paradox', 'misdirection']
+        self.choices = {'A': 'inversion', 'B': 'wordplay', 'C': 'switchbait', 'D': 'paradox', 'E': 'misdirection'}
+        self.categories_to_letters = {v: k for k, v in self.choices.items()}
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        text: str = record['text']
+        label: List[str] = record['label']
+        question = f'Text to classify: {text}'
+        choices_list = [f'{key}. {value}' for key, value in self.choices.items()]
+        input_text = prompt(question=question, choices=choices_list, template=MULTIPLE_ANSWER_TEMPLATE)
+        content_list = [ContentText(text=input_text)]
+        target_letters = ''.join(
+            sorted([self.categories_to_letters[cat] for cat in label if cat in self.categories_to_letters])
+        )
+        metadata = {'text': text, 'label': label, 'target_letters': target_letters}
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            choices=choices_list,
+            target=target_letters,
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*([A-E]+)'
+        match = re.search(pattern, prediction)
+        if match:
+            letters = match.group(1).strip().upper()
+            return ''.join(sorted(set(letters)))
+        else:
+            try:
+                answers = parse_answers(prediction)
+                return ''.join(sorted(list(answers)))
+            except Exception as e:
+                logger.warning(f'Could not extract answer from: {prediction}. Error: {e}')
+                return ''
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate the match score between the prediction and reference for multilabel classification.
+
+        Args:
+            original_prediction: The original model output
+            filtered_prediction: The extracted answer (letter format, e.g., "AC")
+            reference: The reference answer (letter format, e.g., "AC")
+            task_state: The current task state
+
+        Returns:
+            Score object with metrics
+        """
+        # Create a Score object as required by the API
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Convert letter answers to category sets
+        pred_categories = set(self.choices.get(letter, '') for letter in filtered_prediction)
+        target_categories = set(self.choices.get(letter, '') for letter in reference)
+
+        # Remove empty strings (may be caused by invalid letters)
+        pred_categories = {cat for cat in pred_categories if cat}
+        target_categories = {cat for cat in target_categories if cat}
+
+        # Calculate TP (true positives), FP (false positives), and FN (false negatives)
+        tp = len(pred_categories & target_categories)  # intersection
+        fp = len(pred_categories - target_categories)  # in prediction but not in target
+        fn = len(target_categories - pred_categories)  # in target but not in prediction
+
+        # Calculate precision, recall and F1 score
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+        # Calculate exact match (1.0 if prediction exactly matches target)
+        exact_match = 1.0 if pred_categories == target_categories else 0.0
+
+        # Store category information in metadata for later aggregation
+        category_data = {}
+        for cat in self.categories:
+            in_pred = cat in pred_categories
+            in_target = cat in target_categories
+
+            category_data[cat] = {
+                'tp': 1 if in_pred and in_target else 0,
+                'fp': 1 if in_pred and not in_target else 0,
+                'fn': 1 if not in_pred and in_target else 0,
+                'support': 1 if in_target else 0
+            }
+
+        # Set simple numerical values in score.value as expected by the API
+        score.value = {'f1': f1, 'precision': precision, 'recall': recall, 'exact_match': exact_match}
+
+        # Store category data in metadata for aggregation
+        score.metadata = {'category_data': category_data}
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate scores across all samples.
+        Computes weighted, macro, and micro F1 scores for multilabel classification.
+
+        Args:
+            sample_scores: List of sample scores
+
+        Returns:
+            List of aggregated scores
+        """
+        if not sample_scores:
+            return [
+                AggScore(metric_name='f1_weighted', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='f1_micro', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='f1_macro', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='exact_match', score=0.0, num=0, metadata={})
+            ]
+
+        # Initialize category statistics
+        category_stats = {cat: {'tp': 0, 'fp': 0, 'fn': 0, 'support': 0} for cat in self.categories}
+        total_exact_matches = 0
+        num_samples = len(sample_scores)
+
+        # Aggregate statistics across all samples
+        for ss in sample_scores:
+            # Add exact match score to total
+            total_exact_matches += ss.score.value.get('exact_match', 0)
+
+            # Get category data from metadata
+            if 'category_data' in ss.score.metadata:
+                cat_data = ss.score.metadata['category_data']
+                for cat, stats in cat_data.items():
+                    if cat in self.categories:
+                        category_stats[cat]['tp'] += stats.get('tp', 0)
+                        category_stats[cat]['fp'] += stats.get('fp', 0)
+                        category_stats[cat]['fn'] += stats.get('fn', 0)
+                        category_stats[cat]['support'] += stats.get('support', 0)
+
+        # Calculate F1 scores for each category
+        category_f1 = {}
+        total_support = sum(stats['support'] for stats in category_stats.values())
+        f1_sum = 0.0
+
+        for cat, stats in category_stats.items():
+            tp = stats['tp']
+            fp = stats['fp']
+            fn = stats['fn']
+
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+            category_f1[cat] = f1
+            f1_sum += f1
+
+        # Calculate micro-average F1 (based on aggregate TP, FP, FN)
+        total_tp = sum(stats['tp'] for stats in category_stats.values())
+        total_fp = sum(stats['fp'] for stats in category_stats.values())
+        total_fn = sum(stats['fn'] for stats in category_stats.values())
+
+        micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
+        micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
+        f1_micro = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if (
+            micro_precision + micro_recall
+        ) > 0 else 0.0
+
+        # Calculate macro-average F1 (simple average of category F1 scores)
+        f1_macro = f1_sum / len(self.categories) if self.categories else 0.0
+
+        # Calculate weighted-average F1 (weighted by support)
+        f1_weighted = 0.0
+        if total_support > 0:
+            for cat, stats in category_stats.items():
+                cat_f1 = category_f1[cat]
+                weight = stats['support'] / total_support
+                f1_weighted += cat_f1 * weight
+
+        # Calculate accuracy (proportion of exact matches)
+        exact_match = total_exact_matches / num_samples
+
+        # Return list of aggregate scores
+        return [
+            AggScore(
+                metric_name='f1_weighted',
+                score=f1_weighted,
+                num=num_samples,
+                metadata={'category_f1': {
+                    cat: f1
+                    for cat, f1 in category_f1.items()
+                }}
+            ),
+            AggScore(metric_name='f1_micro', score=f1_micro, num=num_samples, metadata={}),
+            AggScore(metric_name='f1_macro', score=f1_macro, num=num_samples, metadata={}),
+            AggScore(metric_name='exact_match', score=exact_match, num=num_samples, metadata={})
+        ]
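To make the per-sample scoring above concrete, here is a small self-contained illustration of the letter-set logic in match_score; the answer strings 'AD' and 'ABD' are invented for the example.

# Mirrors the set arithmetic in match_score above; the answers are invented.
choices = {'A': 'inversion', 'B': 'wordplay', 'C': 'switchbait', 'D': 'paradox', 'E': 'misdirection'}

pred = {choices[letter] for letter in 'AD'}    # model predicted inversion + paradox
gold = {choices[letter] for letter in 'ABD'}   # reference also includes wordplay

tp = len(pred & gold)   # 2
fp = len(pred - gold)   # 0
fn = len(gold - pred)   # 1

precision = tp / (tp + fp)                          # 1.0
recall = tp / (tp + fn)                             # ~0.67
f1 = 2 * precision * recall / (precision + recall)  # 0.8
exact_match = float(pred == gold)                   # 0.0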
evalscope/benchmarks/drivelology/drivelology_selection_adapter.py (new file)
@@ -0,0 +1,49 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+PROMPT_TEMPLATE = r"""
+Tell me the best option in the following options which represents the underlying narrative of the text?
+The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+
+{choices}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_selection',
+        pretty_name='DrivelologyNarrativeSelection',
+        tags=[Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['multiple-choice-english-easy', 'multiple-choice-english-hard'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class DrivelologyNarrativeSelectionAdapter(MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['text'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
evalscope/benchmarks/drivelology/drivelology_writing_adapter.py (new file)
@@ -0,0 +1,218 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+# Keep the original generation and evaluation templates
+NARRATIVE_GENERATION_TEMPLATE = """
+You need to first read and understand the text given. Generate a detailed description to illustrate the implicit narrative of the text.
+
+Please provide your response in English, with a clear and comprehensive explanation of the narrative.
+
+Text: {text}
+""".strip()  # noqa: E501
+
+NARRATIVE_EVALUATION_TEMPLATE = """
+Please act as an impartial judge and evaluate how accurately the candidate narrative matches the given reference narrative.
+Your evaluation should consider factors such as the relevance, accuracy, depth, and level of detail of the candidate narrative compared to the reference.
+
+Begin your evaluation by providing a short explanation in English. Be as objective as possible.
+
+After providing your explanation, you must rate the match on a Likert scale from 1 to 5, where:
+1 = Very poor match
+2 = Poor match
+3 = Moderate match
+4 = Good match
+5 = Excellent match
+
+Please format your rating strictly as: "Rating: [[X]]" where X is a whole number from 1 to 5.
+
+[Candidate Narrative]
+{candidate}
+
+[Reference Narrative]
+{reference}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_writing',
+        pretty_name='DrivelologyNarrativeWriting',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['narrative-writing-english'],
+        metric_list={
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/roberta-large',
+                'model_type': 'roberta-large'
+            },
+            'gpt_score': {}
+        },
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=NARRATIVE_GENERATION_TEMPLATE
+    )
+)
+class DrivelologyNarrativeWritingAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.use_batch_scoring = True  # Enable batch scoring
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        text = record['text']
+        reference_narrative = record['narrative']
+
+        # Format the generation prompt with the text
+        input_prompt = NARRATIVE_GENERATION_TEMPLATE.format(text=text)
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=reference_narrative,
+            metadata={
+                'text': text,
+                'reference_narrative': reference_narrative
+            }
+        )
+
+    def batch_match_score(self, original_predictions, filtered_predictions, references, task_states):
+        """
+        Batch calculate the match scores using BERTScore.
+        """
+        from evalscope.metrics.metric import BertScore
+
+        score_args = self.metric_list.get('bert_score', {})
+        bert_scorer = BertScore(**score_args)
+        bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+        scores = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={'bert_score': bert_score_f1[i]}
+            )
+            scores.append(score)
+        return scores
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Calculate the match score using LLM judge and BERTScore.
+        """
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Initialize score value dictionary
+        score.value = {}
+
+        # Use LLM judge to evaluate narrative quality
+        eval_prompt = NARRATIVE_EVALUATION_TEMPLATE.format(candidate=filtered_prediction, reference=reference)
+
+        judge_response = self.llm_judge.judge(eval_prompt)
+        logger.info(f'LLM judge response received (first 100 chars): {judge_response[:100]}...')
+
+        # Extract rating using regex pattern
+        match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
+        if match:
+            rating = int(match.group(1))
+            gpt_score = (rating - 1) / 4.0  # Normalize to 0-1 scale
+            logger.info(f'Rating extracted: {rating}/5 -> {gpt_score}')
+        else:
+            # Try alternative pattern
+            alt_match = re.search(r'(\[\[|\[)([1-5])(\]\]|\])', judge_response)
+            if alt_match:
+                rating = int(alt_match.group(2))
+                gpt_score = (rating - 1) / 4.0
+                logger.info(f'Rating extracted (alt pattern): {rating}/5 -> {gpt_score}')
+            else:
+                # Last resort: standalone digit
+                number_match = re.search(r'(?<!\d)[1-5](?!\d)', judge_response)
+                if number_match:
+                    rating = int(number_match.group(0))
+                    gpt_score = (rating - 1) / 4.0
+                    logger.info(f'Rating extracted (fallback): {rating}/5 -> {gpt_score}')
+                else:
+                    gpt_score = 0.0
+                    logger.warning('No rating found in response, using default 0.0')
+
+        score.value['gpt_score'] = gpt_score
+        score.explanation = f'LLM judge rating: {gpt_score:.2f}'
+
+        score.metadata = {
+            'judge_response': judge_response[:300],
+            'model': getattr(self.llm_judge, 'model_id', 'unknown')
+        }
+
+        score.main_score_name = 'gpt_score'
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate scores across all samples.
+        """
+        if not sample_scores:
+            return [
+                AggScore(metric_name='gpt_score', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='bert_score', score=0.0, num=0, metadata={})
+            ]
+
+        # Extract scores
+        gpt_scores = [ss.score.value.get('gpt_score', 0.0) for ss in sample_scores]
+        bert_scores = [ss.score.value.get('bert_score', 0.0) for ss in sample_scores]
+
+        # Calculate averages
+        avg_gpt_score = sum(gpt_scores) / len(gpt_scores) if gpt_scores else 0.0
+        avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0.0
+
+        return [
+            AggScore(
+                metric_name='gpt_score',
+                score=avg_gpt_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(gpt_scores),
+                    'max_score': max(gpt_scores)
+                }
+            ),
+            AggScore(
+                metric_name='bert_score',
+                score=avg_bert_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(bert_scores),
+                    'max_score': max(bert_scores)
+                }
+            )
+        ]
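The gpt_score reported by the writing adapter is the judge's 1-5 Likert rating rescaled to [0, 1]. A short standalone illustration of the primary extraction path in llm_match_score follows; the judge response text is invented, only the 'Rating: [[X]]' convention comes from the template above.

import re

# Invented judge output for illustration.
judge_response = 'The candidate captures the core irony but misses one nuance. Rating: [[4]]'

match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
rating = int(match.group(1))    # 4
gpt_score = (rating - 1) / 4.0  # 0.75, the value stored under 'gpt_score'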
evalscope/benchmarks/drop/drop_adapter.py
@@ -41,7 +41,7 @@ Answer: 43
         description=
         'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
         dataset_id='AI-ModelScope/DROP',
-        metric_list=['
+        metric_list=['em', 'f1'],
         few_shot_num=3,
         train_split=None,
         eval_split='validation',
@@ -54,11 +54,9 @@ class DROPAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        if self.few_shot_num != 0:
+        if self.few_shot_num != 0 and self.few_shot_num != 3:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
-        else:
-            self.few_shot_num = 0

     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
@@ -70,9 +68,10 @@ class DROPAdapter(DefaultDataAdapter):
         Returns:
             Sample: Sample object with input, target, and metadata.
         """
+        from .utils import _get_gold_answers

         # Parse gold answers
-        gold_answers =
+        gold_answers = _get_gold_answers(record)

         return Sample(
             input=record['question'],
@@ -102,33 +101,6 @@ class DROPAdapter(DefaultDataAdapter):
             query=query,
         )

-    def _get_gold_answers(self, input_d: dict) -> List[str]:
-        """
-        Parse the raw input labels (gold).
-        """
-
-        def _flatten_validated_answers(validated_answers):
-            """Flattens a dict of lists of validated answers."""
-            valid_answers = []
-            for i in range(len(validated_answers['number'])):
-                valid_answers.append({
-                    'number': validated_answers['number'][i],
-                    'date': validated_answers['date'][i],
-                    'spans': validated_answers['spans'][i],
-                })
-            return valid_answers
-
-        answers = []
-        answers_set = set()
-        candidates = [input_d['answer']] + _flatten_validated_answers(input_d['validated_answers'])
-        for candidate in candidates:
-            answer = DROPAdapter.parse_answer(candidate)
-            if answer in answers_set:
-                continue
-            answers_set.add(answer)
-            answers.append(answer)
-        return answers
-
     def extract_answer(self, prediction: str, task_state: TaskState):
         """
         Extract the answer from the model prediction.
@@ -147,7 +119,9 @@ class DROPAdapter(DefaultDataAdapter):
         """
        Calculate accuracy score by matching prediction with reference answers.
         """
-
+        import numpy as np
+
+        from .utils import _align_bags, _answer_to_bags

         score = Score(
             extracted_prediction=filtered_prediction,
@@ -155,6 +129,7 @@ class DROPAdapter(DefaultDataAdapter):
         )

         max_em = 0
+        max_f1 = 0
         reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
         for gold_answer in reference:
             # Convert the answers to bags of answers
@@ -165,20 +140,16 @@ class DROPAdapter(DefaultDataAdapter):
                 exact_match = 1.0
             else:
                 exact_match = 0.0
+
+            f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+            f1_score = np.mean(f1_per_bag)
+            f1_score = round(f1_score, 2)
             # Check if the answer is empty
             if gold_answer[0].strip():
                 max_em = max(max_em, exact_match)
+                max_f1 = max(max_f1, f1_score)

-        score.value = {'
-        score.main_score_name = '
+        score.value = {'em': max_em, 'f1': max_f1}
+        score.main_score_name = 'f1'

         return score
-
-    @staticmethod
-    def parse_answer(answer):
-        # NOTE: Everything is returned as a tuple for uniformity and hashability.
-        if answer['number'] != '':
-            return (str(answer['number']), )
-        if answer['spans'] != []:
-            return tuple(answer['spans'])
-        return (' '.join([answer['date']['day'], answer['date']['month'], answer['date']['year']]).strip(), )