evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/v4/utils.py (new file)
@@ -0,0 +1,410 @@
+from __future__ import annotations
+
+import traceback
+from collections import defaultdict
+from copy import deepcopy
+from pathlib import Path
+from tqdm import tqdm
+from typing import Any, Dict, List, Tuple
+
+from evalscope.report import (
+    Category,
+    Report,
+    Subset,
+    percentage_weighted_average_from_subsets,
+    unweighted_average_from_subsets,
+    weighted_average_from_subsets,
+)
+
+# ----------------------------
+# Public constants (extracted)
+# ----------------------------
+
+ALL_AVAILABLE_MEMORY_BACKENDS: List[str] = [
+    'kv',
+    'vector',
+    'rec_sum',
+]
+
+NON_LIVE_CATEGORY: List[str] = [
+    'simple_python',
+    'simple_java',
+    'simple_javascript',
+    'multiple',
+    'parallel',
+    'parallel_multiple',
+    'irrelevance',
+]
+LIVE_CATEGORY: List[str] = [
+    'live_simple',
+    'live_multiple',
+    'live_parallel',
+    'live_parallel_multiple',
+    'live_irrelevance',
+    'live_relevance',
+]
+MULTI_TURN_CATEGORY: List[str] = [
+    'multi_turn_base',
+    'multi_turn_miss_func',
+    'multi_turn_miss_param',
+    'multi_turn_long_context',
+]
+WEB_SEARCH_CATEGORY: List[str] = [
+    'web_search_base',
+    'web_search_no_snippet',
+]
+
+MEMORY_CATEGORY: List[str] = [f'memory_{backend}' for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
+MEMORY_SCENARIO_NAME = [
+    'student',
+    'customer',
+    'finance',
+    'healthcare',
+    'notetaker',
+]
+
+SINGLE_TURN_CATEGORY: List[str] = NON_LIVE_CATEGORY + LIVE_CATEGORY
+AGENTIC_CATEGORY: List[str] = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
+
+ALL_SCORING_CATEGORIES: List[str] = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
+
+# Dummy models used only to infer underscore_to_dot behavior
+DUMMY_MODEL_UNDERSCORE_TO_DOT = 'gpt-4o-2024-11-20-FC'
+DUMMY_MODEL_NO_UNDERSCORE_TO_DOT = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+# ----------------------------
+# Data preparation helpers
+# ----------------------------
+
+
+def load_bfcl_data(categories: List[str]) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
+    """
+    Load test entries and ground truth data from bfcl_eval for given categories.
+    """
+    from bfcl_eval.utils import is_relevance_or_irrelevance, load_dataset_entry, load_ground_truth_entry
+
+    test_entries_by_cat: Dict[str, List[Dict]] = defaultdict(list)
+    ground_truth_by_cat: Dict[str, List[Dict]] = defaultdict(list)
+
+    for category in categories:
+        test_entries_by_cat[category] = load_dataset_entry(
+            category, include_prereq=True, include_language_specific_hint=False
+        )
+        if not is_relevance_or_irrelevance(category):
+            ground_truth_by_cat[category] = load_ground_truth_entry(category)
+
+    return test_entries_by_cat, ground_truth_by_cat
+
+
+def prepare_ground_truth_map(category: str, ground_truth_entries: List[Dict]) -> Dict[str, Dict]:
+    """
+    Map ground truth entries to IDs with category-specific adjustments.
+    """
+    from bfcl_eval.utils import is_memory, is_web_search
+
+    if not ground_truth_entries:
+        return {}
+
+    if is_memory(category):
+        return {entry['id'].replace('memory', category): entry for entry in ground_truth_entries}
+    if is_web_search(category):
+        return {entry['id'].replace('web_search', category): entry for entry in ground_truth_entries}
+    return {entry['id']: entry for entry in ground_truth_entries}
+
+
+def process_test_entries(
+    category: str,
+    test_entries: List[Dict[str, Any]],
+    ground_truth_entries: List[Dict[str, Any]],
+    model_result_dir: Path,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """
+    Clean and enrich test entries, return processed entries and prereq entries.
+
+    Returns:
+        processed_entries: entries ready to be mapped to Samples
+        prereq_entries: entries requiring prereq inference (memory snapshots)
+    """
+    from bfcl_eval.utils import (
+        clean_up_memory_prereq_entries,
+        is_memory_prereq,
+        populate_initial_settings_for_memory_test_cases,
+        populate_initial_settings_for_web_search_test_cases,
+    )
+
+    ground_truth_map = prepare_ground_truth_map(category, ground_truth_entries)
+
+    test_entries = clean_up_memory_prereq_entries(test_entries)
+    test_entries = populate_initial_settings_for_web_search_test_cases(test_entries)
+    test_entries = populate_initial_settings_for_memory_test_cases(test_entries, model_result_dir=model_result_dir)
+
+    prereq_entries = [entry for entry in test_entries if is_memory_prereq(entry['id'])]
+    main_entries = [entry for entry in test_entries if not is_memory_prereq(entry['id'])]
+
+    processed_entries: List[Dict[str, Any]] = []
+    for entry in main_entries:
+        entry_id = entry['id']
+        entry['category'] = category
+        entry['ground_truth'] = ground_truth_map.get(entry_id, {}).get('ground_truth', {})
+        processed_entries.append(entry)
+
+    return processed_entries, prereq_entries
+
+
+def run_prereq_inference(
+    handler: Any,
+    prereq_entries: List[Dict[str, Any]],
+    model_result_dir: Path,
+    batch_size: int,
+    logger: Any,
+) -> None:
+    """
+    Run prerequisite inferences for memory snapshot creation if results are missing.
+    Optimized to run different (backend, scenario) groups in parallel while preserving in-group order.
+    """
+    import re
+    from bfcl_eval.utils import get_directory_structure_by_id
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+
+    if not prereq_entries:
+        return
+
+    def _parse_backend_scenario_idx(entry_id: str) -> Tuple[str, str, int]:
+        """
+        Extract backend, scenario, and scenario index from an entry id.
+        Expected format:
+            memory_{backend}_prereq_{total_index}-{scenario}-{scenario_index}
+        Returns ('unknown', 'unknown', 0) on failure.
+        """
+        backend = 'unknown'
+        scenario = 'unknown'
+        idx = 0
+
+        m_backend = re.search(r'^memory_(?P<backend>.+?)_prereq_', entry_id)
+        if m_backend:
+            backend = m_backend.group('backend')
+
+        m_tail = re.search(r'-(?P<scenario>[a-zA-Z_]+)-(?P<idx>\d+)$', entry_id)
+        if m_tail:
+            scenario = m_tail.group('scenario')
+            idx = int(m_tail.group('idx'))
+
+        return backend, scenario, idx
+
+    # Group entries by (backend, scenario)
+    groups: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
+    for entry in prereq_entries:
+        backend, scenario, idx = _parse_backend_scenario_idx(entry['id'])
+        entry['_group_backend'] = backend
+        entry['_group_scenario'] = scenario
+        entry['_scenario_idx'] = idx
+        groups.setdefault((backend, scenario), []).append(entry)
+
+    # Sort entries within each group by scenario index to keep order
+    for group_entries in groups.values():
+        group_entries.sort(key=lambda e: e.get('_scenario_idx', 0))
+
+    # Worker to process a single (backend, scenario) group sequentially
+    def _process_group_entries(group_entries: List[Dict[str, Any]], progress: Any) -> None:
+        for entry in group_entries:
+            try:
+                memory_snapshot_folder = (
+                    model_result_dir / get_directory_structure_by_id(entry['id']) / 'memory_snapshot'
+                    / 'prereq_checkpoints'
+                )
+                existing_filenames = {f.name for f in memory_snapshot_folder.rglob('*.json')}
+                if (entry['id'] + '.json') in existing_filenames:
+                    logger.info(f'Skipping prereq inference for entry ID {entry["id"]} as result already exists.')
+                else:
+                    handler.inference(deepcopy(entry), include_input_log=False, exclude_state_log=False)
+            except Exception as e:
+                logger.error(f'Error during prereq inference for entry ID {entry.get("id")}: {e}')
+                logger.error(traceback.format_exc())
+            finally:
+                # tqdm is thread-safe; each worker updates shared progress bar
+                progress.update(1)
+
+    # Run each (backend, scenario) group in parallel; preserve in-group order
+    total = len(prereq_entries)
+    with tqdm(total=total, desc='Running prereq inferences for memory snapshots...') as progress:
+        max_workers = min(batch_size, len(groups))
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [
+                executor.submit(_process_group_entries, group_entries, progress) for group_entries in groups.values()
+            ]
+            for _ in as_completed(futures):
+                # Errors are logged within workers
+                pass
+
+    # Cleanup temp keys
+    for group_entries in groups.values():
+        for entry in group_entries:
+            entry.pop('_group_backend', None)
+            entry.pop('_group_scenario', None)
+            entry.pop('_scenario_idx', None)
+
+
+# ----------------------------
+# Scoring helpers
+# ----------------------------
+
+
+def compute_entry_result(
+    handler: Any,
+    model_result: Any,
+    prompt_entry: Dict[str, Any],
+    underscore_to_dot: bool,
+) -> Dict[str, Any]:
+    """
+    Compute evaluation result for a single entry across BFCL categories.
+    """
+    from bfcl_eval.constants.enums import Language, ReturnFormat
+    from bfcl_eval.eval_checker.eval_runner import (
+        _evaluate_single_agentic_entry,
+        _evaluate_single_ast_entry,
+        _evaluate_single_multi_turn_entry,
+        _evaluate_single_relevance_entry,
+    )
+    from bfcl_eval.utils import is_agentic, is_java, is_js, is_multi_turn, is_relevance_or_irrelevance
+
+    test_category = prompt_entry['category']
+    index = prompt_entry['id']
+    ground_truth = prompt_entry.get('ground_truth', {})
+
+    model_name = (DUMMY_MODEL_UNDERSCORE_TO_DOT if underscore_to_dot else DUMMY_MODEL_NO_UNDERSCORE_TO_DOT)
+
+    if is_relevance_or_irrelevance(test_category):
+        return _evaluate_single_relevance_entry(
+            handler=handler,
+            index=index,
+            model_result_item=model_result,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+
+    elif is_multi_turn(test_category):
+        return _evaluate_single_multi_turn_entry(
+            handler=handler,
+            test_entry_id=index,
+            model_result_list=model_result,
+            ground_truth_list=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+
+    elif is_agentic(test_category):
+        return _evaluate_single_agentic_entry(
+            handler=handler,
+            index=index,
+            model_result_list=model_result,
+            possible_answer_item=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+        )
+    else:
+        # AST categories (python/java/js)
+        if is_java(test_category):
+            language = Language.JAVA
+            return_format = ReturnFormat.JAVA
+        elif is_js(test_category):
+            language = Language.JAVASCRIPT
+            return_format = ReturnFormat.JAVASCRIPT
+        else:
+            language = Language.PYTHON
+            return_format = ReturnFormat.PYTHON
+
+        return _evaluate_single_ast_entry(
+            handler=handler,
+            index=index,
+            model_result_item=model_result,
+            possible_answer_item=ground_truth,
+            prompt_entry=prompt_entry,
+            model_name=model_name,
+            test_category=test_category,
+            language=language,
+            return_format=return_format,
+        )
+
+
+# ----------------------------
+# Report aggregation helpers
+# ----------------------------
+
+
+def compute_aggregate_subsets(report: Report) -> None:
+    """
+    Compute aggregated subsets and overall score for BFCL report.
+    Modifies the report in-place.
+    """
+    for metric in report.metrics:
+        # Collect all subsets in a dictionary for easy access
+        subset_dict: Dict[str, Subset] = {}
+        for category in metric.categories:
+            for subset in category.subsets:
+                subset_dict[subset.name] = subset
+
+        # Step 1: simple_ast
+        simple_subsets = ['simple_python', 'simple_java', 'simple_javascript']
+        simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+        subset_dict['simple_ast'] = simple_ast
+
+        # Step 2.1: non_live (simple_ast, multiple, parallel, parallel_multiple)
+        non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+        non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+        subset_dict['non_live'] = non_live
+
+        # Step 2.2: live (weighted)
+        live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+        live = weighted_average_from_subsets(live_subsets, subset_dict)
+        subset_dict['live'] = live
+
+        # Step 2.3: hallucination (unweighted)
+        hallucination_subsets = ['live_irrelevance', 'irrelevance']
+        hallucination = unweighted_average_from_subsets(hallucination_subsets, subset_dict)
+        subset_dict['hallucination'] = hallucination
+
+        # Step 2.4: multi_turn (unweighted)
+        multi_turn_subsets = [
+            'multi_turn_base',
+            'multi_turn_miss_func',
+            'multi_turn_miss_param',
+            'multi_turn_long_context',
+        ]
+        multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+        subset_dict['multi_turn'] = multi_turn
+
+        # Step 2.5: web_search (unweighted)
+        web_search_subsets = ['web_search_base', 'web_search_no_snippet']
+        web_search = unweighted_average_from_subsets(web_search_subsets, subset_dict)
+        subset_dict['web_search'] = web_search
+
+        # Step 2.6: memory (unweighted)
+        memory_subsets = ['memory_kv', 'memory_vector', 'memory_rec_sum']
+        memory = unweighted_average_from_subsets(memory_subsets, subset_dict)
+        subset_dict['memory'] = memory
+
+        # Step 2.7: agentic (unweighted)
+        agentic_subsets = ['web_search', 'memory']
+        agentic = unweighted_average_from_subsets(agentic_subsets, subset_dict)
+        subset_dict['agentic'] = agentic
+
+        # Step 4: overall (percentage weighted average)
+        overall_subsets = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination']
+        overall = percentage_weighted_average_from_subsets(overall_subsets, subset_dict, weights=[40, 30, 10, 10, 10])
+        subset_dict['overall'] = overall
+
+        # Add computed scores to the category
+        computed_subset_names = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination', 'overall']
+
+        # Add the computed scores as new subsets in the metric
+        dummy_subsets: List[Subset] = []
+        for subset_name in computed_subset_names:
+            if subset_name in subset_dict and subset_dict[subset_name].num > 0:
+                subset = subset_dict[subset_name]
+                subset.name = subset_name.upper()
+                dummy_subsets.append(subset)
+        dummy_category = Category(name='-', subsets=dummy_subsets)
+        metric.categories.append(dummy_category)
File without changes (evalscope/benchmarks/biomix_qa/__init__.py)
evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py (new file)
@@ -0,0 +1,36 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+DESCRIPTION = (
+    'BiomixQA is a curated biomedical question-answering dataset. '
+    'BiomixQA has been utilized to validate the Knowledge Graph based '
+    'Retrieval-Augmented Generation (KG-RAG) framework across different LLMs.'
+) # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='biomix_qa',
+        pretty_name='BioMixQA',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.MEDICAL],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/biomix-qa',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
+)
+class BioMixQAAdapter(MultiChoiceAdapter):
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
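Once registered, the new `biomix_qa` benchmark can be selected by name like any other evalscope dataset. A minimal run sketch, assuming evalscope 1.2.0 is installed and using its standard `TaskConfig`/`run_task` entry points; the model identifier and sample limit below are placeholders, not values from this diff:

```python
# Sketch only: placeholder model and limit, shown to illustrate selecting the new benchmark.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model; any supported model works here
    datasets=['biomix_qa'],              # benchmark name registered by BioMixQAAdapter above
    limit=10,                            # evaluate a small subset as a smoke test
)
run_task(task_cfg)
```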
File without changes (evalscope/benchmarks/blink/__init__.py)
evalscope/benchmarks/blink/blink_adapter.py (new file)
@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+""".strip()
+
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.', # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
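A small sketch of how `record_to_sample` renders a BLINK question and normalizes the target letter; the record values are made up, and the letters string assumes `format_letter_choices` renders four options as "A, B, C, D":

```python
# Hypothetical record; illustrates the prompt formatting and '(B)' -> 'B' normalization above.
prompt_template = (
    "Answer the following multiple choice question. The last line of your response should be of the following format:\n"
    "'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.\n\n"
    "{question}"
)

record = {'prompt': 'Which two images show the same object?', 'answer': '(B)'}  # made-up values
print(prompt_template.format(letters='A, B, C, D', question=record['prompt']))
print(record['answer'].strip('(').strip(')'))  # -> 'B', stored as the Sample target
```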
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from functools import partial
 from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
-from evalscope.api.dataset import
+from evalscope.api.dataset import Sample
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
File without changes (evalscope/benchmarks/chartqa/__init__.py)
evalscope/benchmarks/chartqa/chartqa_adapter.py (new file)
@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+OPEN_PROMPT = """
+{question}
+
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.', # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'], # 'human_test' or 'augmented_split'
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )
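The `extract_answer` hook above pulls the final "ANSWER: ..." fragment out of the model response before scoring. A quick illustration with a made-up model response:

```python
import re

# Made-up response text; demonstrates the r'ANSWER:\s*(.*)' extraction used by ChartQAAdapter.
prediction = 'The bar for 2020 is highest.\nANSWER: 42%'
match = re.search(r'ANSWER:\s*(.*)', prediction)
print(match.group(1).strip())  # -> '42%'
```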
evalscope/benchmarks/chartqa/utils.py (new file)
@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+        target: List of target string.
+        prediction: List of predicted string.
+        max_relative_change: Maximum relative change.
+
+    Returns:
+        Whether the prediction was correct given the specified tolerance.
+    """ # noqa: E501
+
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()
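A few illustrative calls to `relaxed_correctness` (inputs are made up) showing the 5% numeric tolerance and the case-insensitive exact match for non-numeric answers; this assumes evalscope 1.2.0 is installed so the new module is importable at this path:

```python
from evalscope.benchmarks.chartqa.utils import relaxed_correctness

print(relaxed_correctness('0.52', '0.5'))     # True: 4% relative change, within the 5% tolerance
print(relaxed_correctness('54%', '0.54'))     # True: '54%' is parsed as 0.54
print(relaxed_correctness('0.6', '0.5'))      # False: 20% relative change
print(relaxed_correctness('Apple', 'apple'))  # True: non-numeric answers compare case-insensitively
```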
File without changes (evalscope/benchmarks/coin_flip/__init__.py)