evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/coin_flip/coin_flip_adapter.py (new file)
@@ -0,0 +1,128 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DESCRIPTION = (
+    "CoinFlip is a symbolic reasoning dataset that tests an LLM's ability "
+    'to track binary state changes through a sequence of actions. '
+    'Each example describes whether a coin is flipped or not by different person, '
+    'requiring logical inference to determine the final state (heads or tails).'
+)  # noqa: E501
+
+PROMPT_TEMPLATE = """
+Solve the following coin flip problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{question}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer YES or NO to the problem.
+
+Reasoning:
+"""  # noqa: E501
+
+FEWSHOT_TEMPLATE = """
+Here are some examples of how to solve similar problems:
+
+{fewshot}
+
+""".lstrip() + PROMPT_TEMPLATE  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='coin_flip',
+        pretty_name='CoinFlip',
+        tags=[Tags.REASONING, Tags.YES_NO],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/coin-flip',
+        metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+        aggregation='f1',
+        few_shot_num=0,
+        train_split='validation',
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoinFlipAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        answer = record['answer']
+        input_text = self.prompt_template.format(question=question)
+        content_list: List[Content] = [ContentText(text=input_text)]
+        answer = str(answer).upper()  # 'YES' or 'NO'
+        return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+            'answer': answer,
+        })
+
+    def extract_answer(self, prediction, task_state):
+        import re
+
+        match = re.search(r'ANSWER:\s*(.*)', prediction)
+        return match.group(1) if match else prediction
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        # Check for an exact match against the extracted answer.
+        result = 1 if reference in filtered_prediction else 0
+        score.value = {'acc': result}
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+        """
+
+        tp = fp = tn = fn = 0
+        yes_count = 0
+        total_count = len(sample_scores)
+
+        for ss in sample_scores:
+            gt = ss.sample_metadata['answer'].strip().upper()
+            pred = ss.score.extracted_prediction.strip().upper()
+
+            if pred == 'YES':
+                yes_count += 1
+            if pred == 'YES' and gt == 'YES':
+                tp += 1
+            elif pred == 'YES' and gt == 'NO':
+                fp += 1
+            elif pred == 'NO' and gt == 'NO':
+                tn += 1
+            elif pred == 'NO' and gt == 'YES':
+                fn += 1
+
+        accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+        yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+        overall_metrics = {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1_score': f1_score,
+            'yes_ratio': yes_ratio
+        }
+
+        agg_scores = []
+        for metric_name, value in overall_metrics.items():
+            agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+        return agg_scores
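For orientation, here is a minimal sketch of how the newly registered coin_flip benchmark could be run through evalscope's Python entry point, assuming the documented TaskConfig/run_task API; the model id, limit, and any other settings below are placeholders, not values taken from this diff.

# Illustrative only: model id and limit are placeholders, not part of this diff.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model identifier
    datasets=['coin_flip'],            # benchmark name registered in the adapter above
    limit=20,                          # evaluate a small subset while testing
)

run_task(task_cfg=task_cfg)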
evalscope/benchmarks/commonsense_qa/__init__.py (file without changes)
evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py (new file)
@@ -0,0 +1,32 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = 'CommonsenseQA requires different types of commonsense knowledge to predict the correct answers.'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='commonsense_qa',
+        pretty_name='CommonsenseQA',
+        tags=[Tags.REASONING, Tags.COMMONSENSE, Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/commonsense-qa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='validation',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class CommonsenseQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
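As an illustration of the record layout this adapter expects (field names taken from record_to_sample above; the concrete values are invented and the answer format depends on the dataset), a single row might look like this:

# Hypothetical record, shaped like the fields read in record_to_sample above.
record = {
    'question': 'Where would you most likely find a stapler?',
    'choices': ['river', 'office', 'forest', 'beach', 'cave'],
    'answer': 'B',  # format depends on the dataset; shown here as a letter label
}
# The adapter maps this to Sample(input=question, choices=[...], target=answer),
# and MultiChoiceAdapter renders it with MultipleChoiceTemplate.SINGLE_ANSWER.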
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -71,3 +71,8 @@ class CompetitionMathAdapter(DefaultDataAdapter):
 
     def sample_to_fewshot(self, sample: Sample) -> str:
         return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
+
+    def extract_answer(self, prediction: str, task_state):
+        from evalscope.metrics.math_parser import extract_answer
+
+        return extract_answer(prediction)
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -6,9 +6,7 @@ from typing import Any, Dict, List
 from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
 from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
 from evalscope.api.evaluator import TaskState
-from evalscope.api.metric import Score
 from evalscope.api.metric.scorer import AggScore, SampleScore
-from evalscope.api.model.model import Model
 from evalscope.api.registry import get_benchmark, register_benchmark
 from evalscope.config import TaskConfig
 from evalscope.constants import DataCollection, Tags
@@ -22,8 +20,13 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name=DataCollection.NAME,
+        pretty_name='Data-Collection',
         dataset_id='',  # dataset_id need to be set
-        description='Data collection'
+        description='Custom Data collection, mixing multiple evaluation datasets for '
+        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+        'assessment of the model\'s capabilities. '
+        '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
+        tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
         prompt_template='',
@@ -55,9 +58,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
             data_id_or_path=dataset_path,
             split=self.eval_split,
             sample_fields=self.record_to_sample,
-            subset=
+            subset='test',  # NOTE: using hardcoded test subset
             limit=self.limit,
-            repeats=self.repeats
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()
 
         test_dataset = DatasetDict({self.default_subset: dataset})
@@ -95,7 +99,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
 
         # load dataset args
         dataset_args = copy.deepcopy(self._task_config.dataset_args)
-        common_args = dataset_args.get(DataCollection.NAME, {})
 
         # Iterate through each sample in the dataset
         dataset = self.test_dataset[self.default_subset]
@@ -108,7 +111,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
 
             # update dataset args
             cur_dataset_args = dataset_args.get(dataset_name, {})
-            cur_dataset_args.update(common_args)
 
             # Initialize dataset adapter
             if dataset_name not in self.dataset_adapters:
@@ -141,19 +143,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
         data = []
         for sample_score in sample_scores:
             collection_info = sample_score.sample_metadata[DataCollection.INFO]
[... 12 removed lines (old 144-155) not rendered in this view ...]
+            main_score = sample_score.score.main_value
+            main_metric = sample_score.score.main_score_name
+
+            # use main score
+            data.append(
+                dict(
+                    task_type=collection_info['task_type'],
+                    categories=tuple(collection_info['categories']),
+                    dataset_name=collection_info['dataset_name'],
+                    subset_name=collection_info['subset_name'],
+                    tags=collection_info['tags'],
+                    sample_id=sample_score.sample_id,
+                    metric=main_metric,
+                    score=main_score
                 )
+            )
 
         df = pd.DataFrame(data)
 
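The per-sample rows appended above (task_type, categories, dataset_name, subset_name, tags, sample_id, metric, and the main score) feed a pandas DataFrame for the collection report. As a rough, illustrative sketch of what that enables (toy rows, not code from this diff), per-dataset averages of the main score fall out of a simple groupby:

import pandas as pd

# Hypothetical rows shaped like the dicts built in the hunk above.
data = [
    dict(task_type='qa', categories=('math',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=1, metric='acc', score=1.0),
    dict(task_type='qa', categories=('math',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=2, metric='acc', score=0.0),
]

df = pd.DataFrame(data)
# Average the main score per dataset/subset, mirroring a typical collection report.
report = df.groupby(['dataset_name', 'subset_name'])['score'].mean().reset_index()
print(report)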
evalscope/benchmarks/docvqa/__init__.py (file without changes)
evalscope/benchmarks/docvqa/docvqa_adapter.py (new file)
@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
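The adapter above scores with 'anls'. For readers unfamiliar with the metric, the following is a small, self-contained sketch of the standard ANLS (Average Normalized Levenshtein Similarity) definition used by DocVQA-style benchmarks, with the usual 0.5 threshold; it is illustrative and not necessarily the exact implementation shipped in evalscope.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]


def anls(prediction: str, answers: list, threshold: float = 0.5) -> float:
    # Best normalized similarity over all reference answers for one question,
    # zeroed out when the normalized distance exceeds the threshold.
    best = 0.0
    for gt in answers:
        p, g = prediction.strip().lower(), gt.strip().lower()
        if not p and not g:
            best = max(best, 1.0)
            continue
        nl = levenshtein(p, g) / max(len(p), len(g))
        best = max(best, 1.0 - nl if nl < threshold else 0.0)
    return best


# The dataset-level ANLS is the mean of the per-question scores.
print(anls('32 years', ['32 years', '32']))  # 1.0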
evalscope/benchmarks/drivelology/__init__.py (file without changes)
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py (new file)
@@ -0,0 +1,170 @@
+# flake8: noqa: E501
+
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+PROMPT_TEMPLATE = """
+#Instruction#:
+Classify whether the given text is a Drivelology sample or not.
+
+#Definition#:
+- Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+emotional insight to unravel their true significance.
+- non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+or proverbs, that convey clear or straightforward information without the layered complexity
+characteristic of Drivelology.
+
+#Output Format#:
+You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+The answer you give MUST be \"Yes\" or \"No\"".
+
+#Input Text#: {text}
+#Your Answer#:
+""".strip()  # noqa: E501
+
+FEWSHOT_PROMPT_TEMPLATE = """
+#Instruction#:
+Classify whether the given text is a Drivelology sample or not.
+
+#Definition#:
+- Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+emotional insight to unravel their true significance.
+- non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+or proverbs, that convey clear or straightforward information without the layered complexity
+characteristic of Drivelology.
+
+#Output Format#:
+You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+The answer you give MUST be \"Yes\" or \"No\"".
+
+Here are some examples of how to solve similar problems:
+
+#Input Text#: Saw a book called "how to solve 50 percent of your problems" so I bought 2 books.
+#Your Answer#: Yes
+
+#Input Text#: Colourless green ideas sleep furiously.
+#Your Answer#: No
+
+#Input Text#: I went to a restaurant, and saw this guy was choking. I gotta save him. And then I realized he was just speaking French.
+#Your Answer#: Yes
+
+#Input Text#: Either it is or it isn't.
+#Your Answer#: No
+
+#Input Text#: {text}
+#Your Answer#:
+""".strip()  # noqa: E501
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_binary',
+        pretty_name='DrivelologyBinaryClassification',
+        tags=[Tags.YES_NO],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['binary-classification'],
+        metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+        aggregation='f1',
+        few_shot_num=0,
+        eval_split='test',
+        prompt_template='{question}',
+        few_shot_prompt_template='{question}'
+    )
+)
+class DrivelologyBinaryClassificationAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+        if self.few_shot_num not in [0, 4]:
+            logger.warning(f'For DrivelologyBinaryClassification, use 4-shot by default.')
+            self.few_shot_num = 4
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        if self.few_shot_num > 0:
+            prompt = FEWSHOT_PROMPT_TEMPLATE.format(text=record['text'])
+        else:
+            prompt = PROMPT_TEMPLATE.format(text=record['text'])
+        content_list: List[Content] = [ContentText(text=prompt)]
+        answer = 'YES' if str(record['label']) == 'drivelology' else 'NO'  # 'YES' or 'NO'
+        return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+            'answer': answer,
+        })
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        # Check if the reference answer is in the filtered prediction
+        result = 1 if reference in filtered_prediction.strip().upper() else 0
+        score.value = {'acc': result}
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+        """
+
+        def compute_metrics(scores: List[SampleScore]):
+            tp = fp = tn = fn = 0
+            yes_count = 0
+            total_count = len(scores)
+
+            for ss in scores:
+                gt = ss.sample_metadata['answer'].strip().upper()
+                # Get prediction based on score
+                pred = gt if ss.score.main_value == 1 else ('NO' if gt == 'YES' else 'YES')
+                if pred == 'YES':
+                    yes_count += 1
+                if pred == 'YES' and gt == 'YES':
+                    tp += 1
+                elif pred == 'YES' and gt == 'NO':
+                    fp += 1
+                elif pred == 'NO' and gt == 'NO':
+                    tn += 1
+                elif pred == 'NO' and gt == 'YES':
+                    fn += 1
+
+            accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+            yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+            return {
+                'accuracy': accuracy,
+                'precision': precision,
+                'recall': recall,
+                'f1_score': f1_score,
+                'yes_ratio': yes_ratio
+            }
+
+        overall_metrics = compute_metrics(sample_scores)
+        agg_scores = []
+        for metric_name, value in overall_metrics.items():
+            agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+        return agg_scores
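Both CoinFlipAdapter and this adapter fold per-sample 'acc' results into a confusion matrix before reporting. As a quick, illustrative sanity check of that arithmetic (toy numbers, not benchmark output): with three YES and two NO ground truths where the model misses one YES, tp=2, fn=1, tn=2, fp=0, giving accuracy 0.8, precision 1.0, recall 2/3, F1 0.8, and yes_ratio 0.4. The standalone sketch below reproduces the same formulas on plain (ground_truth, prediction) pairs.

# Toy (ground_truth, prediction) pairs; values are illustrative only.
pairs = [('YES', 'YES'), ('YES', 'YES'), ('YES', 'NO'), ('NO', 'NO'), ('NO', 'NO')]

tp = sum(1 for gt, pred in pairs if gt == 'YES' and pred == 'YES')
fp = sum(1 for gt, pred in pairs if gt == 'NO' and pred == 'YES')
tn = sum(1 for gt, pred in pairs if gt == 'NO' and pred == 'NO')
fn = sum(1 for gt, pred in pairs if gt == 'YES' and pred == 'NO')

total = len(pairs)
accuracy = (tp + tn) / total                        # 0.8
precision = tp / (tp + fp) if tp + fp else 0.0      # 1.0
recall = tp / (tp + fn) if tp + fn else 0.0         # ~0.6667
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0  # ~0.8
yes_ratio = sum(1 for _, pred in pairs if pred == 'YES') / total                    # 0.4

print(accuracy, precision, round(recall, 4), round(f1, 4), yes_ratio)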