evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as published in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py (new file)
@@ -0,0 +1,205 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
+import re
+import urllib.request
+import zipfile
+from pathlib import Path
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Default judge prompt template
+JUDGE_PROMPT = """Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {correct_answer}
+CANDIDATE ANSWER TO ASSESS: {response}
+
+Reply only with CORRECT or INCORRECT."""
+
+PROMPT_TEMPLATE = """
+BEGIN INPUT DOCUMENTS
+
+{documents_text}
+
+END INPUT DOCUMENTS
+
+Answer the following question using the input documents provided above.
+
+START QUESTION
+
+{question}
+
+END QUESTION
+"""
+
+# New constants for auto-download
+DOWNLOAD_URL: str = (
+    'https://modelscope.cn/datasets/evalscope/AA-LCR/resolve/master/extracted_text/AA-LCR_extracted-text.zip'
+)
+DEFAULT_CACHE_SUBDIR: str = 'aa_lcr'
+DEFAULT_ZIP_NAME: str = 'AA-LCR_extracted-text.zip'
+DEFAULT_EXTRACTED_DIR_NAME: str = 'lcr'
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='aa_lcr',
+        pretty_name='AA-LCR',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING, Tags.LONG_CONTEXT],
+        description='AA-LCR (Artificial Analysis Long Context Retrieval) is a benchmark for evaluating long-context '
+        'retrieval and reasoning capabilities of language models across multiple documents.', # noqa: E501
+        dataset_id='evalscope/AA-LCR',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={'text_dir': None}
+    )
+)
+class AALCRAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+        # Get extra parameters
+        self.text_dir = self.extra_params.get('text_dir')
+
+    def load(self):
+        # Auto download and extract when text_dir is not provided
+        if not self.text_dir:
+            self.text_dir = self._ensure_text_dir_downloaded()
+        elif not Path(self.text_dir).exists():
+            raise ValueError(
+                'AA-LCR text_dir does not exist: '
+                f'{self.text_dir}. Please provide a valid directory or omit text_dir to auto-download.'
+            )
+
+        self.text_dir = Path(self.text_dir)
+        return super().load()
+
+    def _ensure_text_dir_downloaded(self) -> Path:
+        """Ensure AA-LCR extracted texts are available locally; download and extract if missing."""
+        cache_root = Path(DEFAULT_EVALSCOPE_CACHE_DIR) / DEFAULT_CACHE_SUBDIR
+        extracted_dir = cache_root / DEFAULT_EXTRACTED_DIR_NAME
+
+        if extracted_dir.exists():
+            logger.info(f'AA-LCR documents found: {extracted_dir}')
+            return extracted_dir
+
+        cache_root.mkdir(parents=True, exist_ok=True)
+        zip_path = cache_root / DEFAULT_ZIP_NAME
+
+        try:
+            logger.info(f'Downloading AA-LCR documents from {DOWNLOAD_URL} to {zip_path}...')
+            urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
+
+            logger.info(f'Extracting {zip_path} to {cache_root}...')
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                zf.extractall(cache_root)
+
+            if not extracted_dir.exists():
+                raise ValueError(f'Extraction succeeded but target directory not found: {extracted_dir}')
+
+            logger.info(f'AA-LCR documents ready at {extracted_dir}')
+            return extracted_dir
+        except Exception as e:
+            raise ValueError(
+                f'Failed to download or extract AA-LCR documents: {e}. '
+                'You can also manually download and set extra_params["text_dir"].'
+            ) from e
+        finally:
+            # Best-effort cleanup of the zip file
+            try:
+                if zip_path.exists():
+                    zip_path.unlink()
+            except Exception:
+                pass
+
+    def _get_context(self, record: Dict[str, Any]) -> str:
+        doc_folder = self.text_dir / record['document_category'] / record['document_set_id']
+
+        # Check if the document folder exists
+        if not doc_folder.exists() or not doc_folder.is_dir():
+            logger.warning(f'Document folder not found: {doc_folder}. Returning empty context.')
+            return ''
+
+        doc_blocks = []
+        try:
+            for file_path in doc_folder.iterdir():
+                if file_path.is_file():
+                    try:
+                        content = file_path.read_text(encoding='utf-8').strip()
+                        if content:
+                            doc_blocks.append(content)
+                    except (IOError, UnicodeDecodeError) as e:
+                        logger.warning(f'Could not read file {file_path}, skipping: {e}')
+        except OSError as e:
+            logger.warning(f'Could not access document folder {doc_folder}: {e}')
+            return f"ERROR: Could not read documents for {record['document_category']}/{record['document_set_id']}"
+
+        documents_text = '\n\n'.join(
+            f'BEGIN DOCUMENT {i + 1}:\n{doc}\nEND DOCUMENT {i + 1}' for i, doc in enumerate(doc_blocks)
+        )
+        return documents_text
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a record to a Sample with long-context prompt."""
+        context = self._get_context(record)
+        prompt = self.prompt_template.format(documents_text=context, question=record['question'])
+
+        return Sample(
+            input=[ChatMessageUser(content=prompt)],
+            target=record['answer'],
+            metadata={
+                'question': record['question'],
+                'data_source_urls': record['data_source_urls'],
+                'input_tokens': record.get('input_tokens', 0),
+            }
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.metadata['question'], correct_answer=reference, response=filtered_prediction
+        )
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        # Use word boundaries to avoid matching "CORRECT" within "INCORRECT"
+        is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score
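One detail of the judge parsing above is easy to miss: the judge is told to reply CORRECT or INCORRECT, and "INCORRECT" contains "CORRECT" as a substring, so the adapter matches with word boundaries rather than a plain substring check. A minimal, self-contained sketch of that parsing step (standard library only, independent of evalscope):

```python
import re

def parse_judge_verdict(judge_response: str) -> float:
    # Word boundaries keep 'CORRECT' from matching inside 'INCORRECT'.
    is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
    return 1.0 if is_correct else 0.0

assert parse_judge_verdict('CORRECT') == 1.0
assert parse_judge_verdict('The answer is INCORRECT.') == 0.0  # no false positive
assert parse_judge_verdict('correct') == 1.0                   # case-insensitive
```

The aime25 adapter further down in this diff applies the same word-boundary idea to its "Yes"/"No" verdicts.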
evalscope/benchmarks/ai2d/ai2d_adapter.py (new file)
@@ -0,0 +1,54 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ai2d',
+        pretty_name='AI2D',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.', # noqa: E501
+        dataset_id='lmms-lab/ai2d',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class Ai2dAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: list[str] = record['options']
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
+        content_list: list[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+
+        label_answer = chr(int(record['answer']) + ord('A'))
+
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
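Two conversions in Ai2dAdapter.record_to_sample are worth spelling out: the dataset stores the answer as a numeric index that is turned into a choice letter, and the raw image bytes are shipped to the model as base64. The sketch below mirrors both; to_data_uri is only a rough stand-in for evalscope's bytes_to_base64(..., add_header=True), whose exact header format is an assumption here:

```python
import base64

def index_to_choice_letter(answer_index: str) -> str:
    # Same conversion as in record_to_sample: '0' -> 'A', '1' -> 'B', ...
    return chr(int(answer_index) + ord('A'))

def to_data_uri(image_bytes: bytes, fmt: str = 'png') -> str:
    # Hypothetical equivalent of bytes_to_base64(image_bytes, format=fmt, add_header=True).
    return f'data:image/{fmt};base64,' + base64.b64encode(image_bytes).decode('ascii')

assert index_to_choice_letter('2') == 'C'
assert to_data_uri(b'\x89PNG').startswith('data:image/png;base64,')
```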
evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,10 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import re
 from typing import Any, Dict
 
 from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -13,6 +15,74 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
+JUDGE_PROMPT = """
+Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
+
+Examples:
+
+Expression 1: $2x+3$
+Expression 2: $3+2x$
+
+Yes
+
+Expression 1: 3/2
+Expression 2: 1.5
+
+Yes
+
+Expression 1: $x^2+2x+1$
+Expression 2: $y^2+2y+1$
+
+No
+
+Expression 1: $x^2+2x+1$
+Expression 2: $(x+1)^2$
+
+Yes
+
+Expression 1: 3245/5
+Expression 2: 649
+
+No
+(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
+
+Expression 1: 2/(-3)
+Expression 2: -2/3
+
+Yes
+(trivial simplifications are allowed)
+
+Expression 1: 72 degrees
+Expression 2: 72
+
+Yes
+(give benefit of the doubt to units)
+
+Expression 1: 64
+Expression 2: 64 square feet
+
+Yes
+(give benefit of the doubt to units)
+
+---
+
+YOUR TASK
+
+
+Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
+
+Expression 1: {expression1}
+Expression 2: {expression2}
+
+"""
+
+PROMPT_TEMPLATE = """
+Solve the following math problem step by step. Put your answer inside \\boxed{{}}.
+
+{question}
+
+Remember to put your answer inside \\boxed{{}}."""
+
 
 @register_benchmark(
     BenchmarkMeta(
@@ -31,7 +101,7 @@ logger = get_logger()
         few_shot_num=0,
         train_split=None,
         eval_split='test',
-        prompt_template=
+        prompt_template=PROMPT_TEMPLATE,
     )
 )
 class AIME25Adapter(DefaultDataAdapter):
@@ -44,3 +114,68 @@ class AIME25Adapter(DefaultDataAdapter):
             input=record['question'],
             target=record['answer'],
         )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """
+        Args:
+            prediction (str): The model prediction to extract from
+            task_state (TaskState): The task state for additional context
+
+        Returns:
+            str: The extracted answer
+        """
+        from evalscope.metrics.math_parser import extract_answer
+        from .math_normalize import normalize_answer
+
+        extracted_pred = extract_answer(prediction)
+        filtered_pred = normalize_answer(extracted_pred)
+        return filtered_pred
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        from evalscope.metrics.math_parser import extract_answer
+        from .grader import grade_answer
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Use the custom grade_answer function for evaluation
+        try:
+            is_correct = grade_answer(extract_answer(original_prediction), reference)
+            accuracy_score = 1.0 if is_correct else 0.0
+            score.value['acc'] = accuracy_score
+        except Exception as e:
+            logger.error(f'Error in custom grading: {e}')
+            score.value['acc'] = 0.0
+            score.metadata['acc'] = f'grading_error: {str(e)}'
+        return score
+
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        judge_prompt = JUDGE_PROMPT.format(expression1=original_prediction, expression2=reference)
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        is_correct = bool(re.search(r'\bYes\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score
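Taken together, the new methods give AIME25 a rule-based grader (grade_answer over the extracted boxed answer) plus an LLM-judge equivalence check; which path runs depends on the task's judge configuration. A hedged usage sketch via evalscope's Python entry point, assuming the TaskConfig/run_task API and the 'aime25' dataset name carry over unchanged to 1.2.0; the model ID and sample limit are placeholders:

```python
from evalscope import TaskConfig, run_task  # assumed public API in evalscope 1.2.0

# Illustrative smoke test only: model ID and limit are placeholders.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Math-7B-Instruct',
    datasets=['aime25'],  # assumed registered benchmark name for AIME25Adapter
    limit=5,
)
run_task(task_cfg=task_cfg)
```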