evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py (new file)
@@ -0,0 +1,146 @@
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tau2_bench',
+        pretty_name='τ²-bench',
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
+        description='τ²-bench (Tau Squared Bench) is an extension and enhancement of the original '
+        'τ-bench (Tau Bench), which is a benchmark designed to evaluate conversational AI agents '
+        'that interact with users through domain-specific API tools and guidelines. '
+        'Please install it with `pip install git+https://github.com/sierra-research/tau2-bench@v0.2.0` '
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau2_bench.html)', # noqa: E501
+        dataset_id='evalscope/tau2-bench-data',
+        subset_list=['airline', 'retail', 'telecom'],
+        aggregation='mean_and_pass_hat_k',
+        eval_split='test',
+        extra_params={
+            'user_model': 'qwen-plus',
+            'api_key': 'EMPTY',
+            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'generation_config': {
+                'temperature': 0.0,
+                'max_tokens': 4096,
+            }
+        }
+    )
+)
+class Tau2BenchAdapter(AgentAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import(
+            'tau2',
+            package='git+https://github.com/sierra-research/tau2-bench@v0.2.0',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+
+        # setup user model args
+        self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+        self.api_key = self.extra_params.get('api_key', 'EMPTY')
+        self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
+
+    def load(self):
+        # Load dataset
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(dataset_name_or_path)
+
+        # Set Tau2 data dir
+        os.environ['TAU2_DATA_DIR'] = dataset_path
+
+        # Load data for each domain
+        from tau2.agent.llm_agent import LLMGTAgent
+        from tau2.registry import registry
+
+        data_dict = defaultdict(dict)
+        for domain_name in self.subset_list:
+            logger.info(f'Loading Tau2-Bench environment: {domain_name}')
+            # Get tasks
+            task_loader = registry.get_tasks_loader(domain_name)
+            tasks = task_loader()
+            tasks = [task for task in tasks if LLMGTAgent.check_valid_task(task)]
+            tasks = [task.model_dump(exclude_unset=True) for task in tasks]
+
+            # load dataset
+            dataset = DictDataLoader(
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
+            ).load()
+
+            data_dict[domain_name] = dataset
+
+        test_dataset = DatasetDict(data_dict)
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['description']['purpose'] or '')],
+            target='', # Will use the record for evaluation
+            subset_key=record['user_scenario']['instructions']['domain'],
+            metadata=record # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample, adapter_instance=self)
+
+    def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Parse the prediction to get the reward
+            task_result = task_state.metadata['task_result']
+            reward = task_result['reward']
+
+            score.value = {
+                'acc': float(reward),
+            }
+            score.explanation = f'Task completed with reward: {reward}'
+            score.metadata = {
+                'task_result': task_result,
+            }
+            score.main_score_name = 'acc'
+
+        except Exception as e:
+            score.value = {'acc': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'acc'
+
+        return score
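The new tau2_bench adapter reads its simulated-user settings from extra_params. Below is a minimal sketch of invoking it through evalscope's Python API, assuming the TaskConfig / run_task / dataset_args pattern from the evalscope documentation; the exact keys accepted by a given version may differ, and the model name is only an example.

# Sketch only: run the newly registered tau2_bench benchmark and override the
# simulated-user settings via extra_params (key names taken from the diff above).
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-plus',                  # model under evaluation (example)
    datasets=['tau2_bench'],
    dataset_args={
        'tau2_bench': {
            'subset_list': ['airline'],  # airline / retail / telecom
            'extra_params': {
                'user_model': 'qwen-plus',
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        }
    },
)
run_task(task_cfg=task_cfg)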
evalscope/benchmarks/tau_bench/tau_bench/generation.py
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)
 
         next_message = oai_res[0].message.model_dump(exclude_none=True)
 
evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py
@@ -1,8 +1,7 @@
-import importlib
 from collections import defaultdict
 from typing import Dict, List
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.dataset.dataset import DatasetDict
 from evalscope.api.dataset.loader import DictDataLoader
@@ -13,6 +12,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
@@ -21,47 +21,43 @@ logger = get_logger()
     BenchmarkMeta(
         name='tau_bench',
         pretty_name='τ-bench',
-        tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
         description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
         'and a language agent provided with domain-specific API tools and policy guidelines. '
         'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
-        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau_bench.html)', # noqa: E501
         dataset_id='https://github.com/sierra-research/tau-bench',
         subset_list=['airline', 'retail'],
-
+        aggregation='mean_and_pass_hat_k',
         eval_split='test',
         extra_params={
             'user_model': 'qwen-plus',
             'api_key': 'EMPTY',
             'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
             'generation_config': {
-                'temperature': 0.
-                '
+                'temperature': 0.0,
+                'max_tokens': 4096,
             }
         }
     )
 )
-class TauBenchAdapter(
+class TauBenchAdapter(AgentAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-
-
-
-
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
@@ -84,10 +80,10 @@ class TauBenchAdapter(DefaultDataAdapter):
 
             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message = res.
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +110,11 @@ class TauBenchAdapter(DefaultDataAdapter):
             })
             # load dataset
             dataset = DictDataLoader(
-                dict_list=tasks,
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
             ).load()
 
             data_dict[env_name] = dataset
@@ -145,24 +145,24 @@ class TauBenchAdapter(DefaultDataAdapter):
 
         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
 
             score.value = {
-                '
+                'acc': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
-            score.main_score_name = '
+            score.main_score_name = 'acc'
 
         except Exception as e:
-            score.value = {'
+            score.value = {'acc': 0.0}
             score.explanation = f'Evaluation failed: {str(e)}'
             score.metadata = {'error': str(e)}
-            score.main_score_name = '
+            score.main_score_name = 'acc'
 
         return score
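Both τ-bench adapters now declare aggregation='mean_and_pass_hat_k', i.e. results are reported as the mean reward together with pass^k over repeated runs of each task. The snippet below is only an illustrative sketch of the pass^k estimator as defined in the τ-bench paper, not evalscope's internal aggregation code.

from math import comb

def pass_hat_k(n: int, c: int, k: int) -> float:
    # Unbiased pass^k estimator (tau-bench paper): probability that k independent
    # attempts at the same task all succeed, given c successes observed across
    # n repeated trials (requires n >= k).
    return comb(c, k) / comb(n, k)

# Example: 3 successful runs out of 4 repeats -> estimated pass^2 = 0.5
print(pass_hat_k(n=4, c=3, k=2))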
evalscope/benchmarks/text2image/evalmuse_adapter.py
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/text2image/genai_bench_adapter.py
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/text2image/general_t2i_adapter.py
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/text2image/hpdv2_adapter.py
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''), # Optional field for existing image path
             }
         )
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -1,7 +1,7 @@
 import json
 from typing import Any, Dict
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
@@ -21,14 +21,14 @@ logger = get_logger()
         description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
         'It includes various subsets such as in-domain and out-of-domain, '
         'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html)',
         dataset_id='AI-ModelScope/ToolBench-Static',
         subset_list=['in_domain', 'out_of_domain'],
         metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
         eval_split='test',
     )
 )
-class ToolBenchAdapter(
+class ToolBenchAdapter(AgentAdapter):
     """
     ToolBench adapter using the new data processing framework.
     """
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):
 
         super().__init__(**kwargs)
 
-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
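The TruthfulQA change above reflects a broader 1.2.0 pattern: choice shuffling is now declared on BenchmarkMeta (shuffle_choices=True) instead of being set on the adapter instance in __init__. Below is a minimal sketch of a hypothetical custom multiple-choice benchmark using the new field; the benchmark name, dataset id, and the MultiChoiceAdapter import path are assumptions for illustration only.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter  # import path assumed
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_custom_mcq',             # hypothetical benchmark name
        dataset_id='my-org/my-mcq-data',  # hypothetical dataset
        metric_list=['multi_choice_acc'],
        subset_list=['default'],
        shuffle_choices=True,             # replaces `self.shuffle_choices = True` in __init__
        few_shot_num=0,
        eval_split='test',
    )
)
class MyCustomMCQAdapter(MultiChoiceAdapter):
    # record_to_sample() and the rest are omitted; only the meta-level flag is shown.
    pass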
evalscope/benchmarks/visu_logic/visu_logic_adapter.py (new file)
@@ -0,0 +1,75 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import parse_answers
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = [
+    'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+    'Attribute Reasoning'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='visulogic',
+        pretty_name='VisuLogic',
+        dataset_id='evalscope/VisuLogic',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class VisuLogicAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record.get('question', '')
+        content_list: List[Content] = []
+        prompt_text = self.prompt_template.format(question=question).strip()
+        content_list.append(ContentText(text=prompt_text))
+
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'id': record['id'],
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['label'],
+            choices=['A', 'B', 'C', 'D'],
+            subset_key=record['tag'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
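The VisuLogic adapter delegates answer extraction to evalscope's parse_answers(), so the snippet below is not the library's implementation; it is only a sketch of the answer contract the prompt establishes ('ANSWER: $LETTER' on the last line), shown with a plain regex for illustration.

import re


def extract_letter(response: str) -> str:
    # Pull the last 'ANSWER: X' occurrence (X in A-D), per the prompt's contract.
    matches = re.findall(r'ANSWER:\s*([A-D])', response)
    return matches[-1] if matches else ''


print(extract_letter('The grid rotates clockwise...\nANSWER: C'))  # -> 'C'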