evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py (new file)
@@ -0,0 +1,146 @@
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tau2_bench',
+        pretty_name='τ²-bench',
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
+        description='τ²-bench (Tau Squared Bench) is an extension and enhancement of the original '
+        'τ-bench (Tau Bench), which is a benchmark designed to evaluate conversational AI agents '
+        'that interact with users through domain-specific API tools and guidelines. '
+        'Please install it with `pip install git+https://github.com/sierra-research/tau2-bench@v0.2.0` '
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau2_bench.html)', # noqa: E501
+        dataset_id='evalscope/tau2-bench-data',
+        subset_list=['airline', 'retail', 'telecom'],
+        aggregation='mean_and_pass_hat_k',
+        eval_split='test',
+        extra_params={
+            'user_model': 'qwen-plus',
+            'api_key': 'EMPTY',
+            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'generation_config': {
+                'temperature': 0.0,
+                'max_tokens': 4096,
+            }
+        }
+    )
+)
+class Tau2BenchAdapter(AgentAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import(
+            'tau2',
+            package='git+https://github.com/sierra-research/tau2-bench@v0.2.0',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+
+        # setup user model args
+        self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+        self.api_key = self.extra_params.get('api_key', 'EMPTY')
+        self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
+
+    def load(self):
+        # Load dataset
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(dataset_name_or_path)
+
+        # Set Tau2 data dir
+        os.environ['TAU2_DATA_DIR'] = dataset_path
+
+        # Load data for each domain
+        from tau2.agent.llm_agent import LLMGTAgent
+        from tau2.registry import registry
+
+        data_dict = defaultdict(dict)
+        for domain_name in self.subset_list:
+            logger.info(f'Loading Tau2-Bench environment: {domain_name}')
+            # Get tasks
+            task_loader = registry.get_tasks_loader(domain_name)
+            tasks = task_loader()
+            tasks = [task for task in tasks if LLMGTAgent.check_valid_task(task)]
+            tasks = [task.model_dump(exclude_unset=True) for task in tasks]
+
+            # load dataset
+            dataset = DictDataLoader(
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
+            ).load()
+
+            data_dict[domain_name] = dataset
+
+        test_dataset = DatasetDict(data_dict)
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['description']['purpose'] or '')],
+            target='', # Will use the record for evaluation
+            subset_key=record['user_scenario']['instructions']['domain'],
+            metadata=record # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample, adapter_instance=self)
+
+    def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Parse the prediction to get the reward
+            task_result = task_state.metadata['task_result']
+            reward = task_result['reward']
+
+            score.value = {
+                'acc': float(reward),
+            }
+            score.explanation = f'Task completed with reward: {reward}'
+            score.metadata = {
+                'task_result': task_result,
+            }
+            score.main_score_name = 'acc'
+
+        except Exception as e:
+            score.value = {'acc': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'acc'
+
+        return score
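The new tau2_bench adapter reads its simulated-user settings from extra_params. Below is a minimal sketch of invoking it through evalscope's Python API, assuming the TaskConfig / run_task / dataset_args pattern from the evalscope documentation; the exact keys accepted by a given version may differ, and the model name is only an example.

# Sketch only: run the newly registered tau2_bench benchmark and override the
# simulated-user settings via extra_params (key names taken from the diff above).
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-plus',                  # model under evaluation (example)
    datasets=['tau2_bench'],
    dataset_args={
        'tau2_bench': {
            'subset_list': ['airline'],  # airline / retail / telecom
            'extra_params': {
                'user_model': 'qwen-plus',
                'api_key': 'EMPTY',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            },
        }
    },
)
run_task(task_cfg=task_cfg)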
evalscope/benchmarks/tau_bench/tau_bench/generation.py
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)
 
         next_message = oai_res[0].message.model_dump(exclude_none=True)
 
evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py
@@ -1,8 +1,7 @@
-import importlib
 from collections import defaultdict
 from typing import Dict, List
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.dataset.dataset import DatasetDict
 from evalscope.api.dataset.loader import DictDataLoader
@@ -13,6 +12,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
@@ -21,47 +21,43 @@ logger = get_logger()
     BenchmarkMeta(
         name='tau_bench',
         pretty_name='τ-bench',
-        tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
         description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
         'and a language agent provided with domain-specific API tools and policy guidelines. '
         'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
-        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau_bench.html)', # noqa: E501
         dataset_id='https://github.com/sierra-research/tau-bench',
         subset_list=['airline', 'retail'],
-
+        aggregation='mean_and_pass_hat_k',
         eval_split='test',
         extra_params={
             'user_model': 'qwen-plus',
             'api_key': 'EMPTY',
             'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
             'generation_config': {
-                'temperature': 0.
-                '
+                'temperature': 0.0,
+                'max_tokens': 4096,
             }
         }
     )
 )
-class TauBenchAdapter(
+class TauBenchAdapter(AgentAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-
-
-
-
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
 
         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
@@ -84,10 +80,10 @@ class TauBenchAdapter(DefaultDataAdapter):
 
             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message = res.
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +110,11 @@ class TauBenchAdapter(DefaultDataAdapter):
             })
             # load dataset
             dataset = DictDataLoader(
-                dict_list=tasks,
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
             ).load()
 
             data_dict[env_name] = dataset
@@ -145,24 +145,24 @@ class TauBenchAdapter(DefaultDataAdapter):
 
         try:
             # Parse the prediction to get the reward
-
-            reward =
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
 
             score.value = {
-                '
+                'acc': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result':
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
-            score.main_score_name = '
+            score.main_score_name = 'acc'
 
         except Exception as e:
-            score.value = {'
+            score.value = {'acc': 0.0}
             score.explanation = f'Evaluation failed: {str(e)}'
             score.metadata = {'error': str(e)}
-            score.main_score_name = '
+            score.main_score_name = 'acc'
 
         return score
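Both τ-bench adapters now declare aggregation='mean_and_pass_hat_k', i.e. results are reported as the mean reward together with pass^k over repeated runs of each task. The snippet below is only an illustrative sketch of the pass^k estimator as defined in the τ-bench paper, not evalscope's internal aggregation code.

from math import comb

def pass_hat_k(n: int, c: int, k: int) -> float:
    # Unbiased pass^k estimator (tau-bench paper): probability that k independent
    # attempts at the same task all succeed, given c successes observed across
    # n repeated trials (requires n >= k).
    return comb(c, k) / comb(n, k)

# Example: 3 successful runs out of 4 repeats -> estimated pass^2 = 0.5
print(pass_hat_k(n=4, c=3, k=2))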
evalscope/benchmarks/text2image/evalmuse_adapter.py
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark'
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/text2image/genai_bench_adapter.py
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/text2image/general_t2i_adapter.py
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/text2image/hpdv2_adapter.py
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark'
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''), # Optional field for existing image path
             }
         )
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -1,7 +1,7 @@
 import json
 from typing import Any, Dict
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
@@ -21,14 +21,14 @@ logger = get_logger()
         description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
         'It includes various subsets such as in-domain and out-of-domain, '
         'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
-        '[Usage Example](https://evalscope.readthedocs.io/
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html)',
         dataset_id='AI-ModelScope/ToolBench-Static',
         subset_list=['in_domain', 'out_of_domain'],
         metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
         eval_split='test',
     )
 )
-class ToolBenchAdapter(
+class ToolBenchAdapter(AgentAdapter):
     """
     ToolBench adapter using the new data processing framework.
     """
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):
 
         super().__init__(**kwargs)
 
-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
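The TruthfulQA change above reflects a broader 1.2.0 pattern: choice shuffling is now declared on BenchmarkMeta (shuffle_choices=True) instead of being set on the adapter instance in __init__. Below is a minimal sketch of a hypothetical custom multiple-choice benchmark using the new field; the benchmark name, dataset id, and the MultiChoiceAdapter import path are assumptions for illustration only.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter  # import path assumed
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_custom_mcq',             # hypothetical benchmark name
        dataset_id='my-org/my-mcq-data',  # hypothetical dataset
        metric_list=['multi_choice_acc'],
        subset_list=['default'],
        shuffle_choices=True,             # replaces `self.shuffle_choices = True` in __init__
        few_shot_num=0,
        eval_split='test',
    )
)
class MyCustomMCQAdapter(MultiChoiceAdapter):
    # record_to_sample() and the rest are omitted; only the meta-level flag is shown.
    pass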
evalscope/benchmarks/visu_logic/visu_logic_adapter.py (new file)
@@ -0,0 +1,75 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import parse_answers
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = [
+    'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+    'Attribute Reasoning'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='visulogic',
+        pretty_name='VisuLogic',
+        dataset_id='evalscope/VisuLogic',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class VisuLogicAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record.get('question', '')
+        content_list: List[Content] = []
+        prompt_text = self.prompt_template.format(question=question).strip()
+        content_list.append(ContentText(text=prompt_text))
+
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'id': record['id'],
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['label'],
+            choices=['A', 'B', 'C', 'D'],
+            subset_key=record['tag'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
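The VisuLogic adapter delegates answer extraction to evalscope's parse_answers(), so the snippet below is not the library's implementation; it is only a sketch of the answer contract the prompt establishes ('ANSWER: $LETTER' on the last line), shown with a plain regex for illustration.

import re


def extract_letter(response: str) -> str:
    # Pull the last 'ANSWER: X' occurrence (X in A-D), per the prompt's contract.
    matches = re.findall(r'ANSWER:\s*([A-D])', response)
    return matches[-1] if matches else ''


print(extract_letter('The grid rotates clockwise...\nANSWER: C'))  # -> 'C'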